1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP nodes use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs. So this
144// maximum leaf-node limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet fully
150// supported for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
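// NOTE (illustrative; inferred from the comment above, not upstream text):
// a unary MERGE_PASSTHRU node is laid out as
//   (FABS_MERGE_PASSTHRU pg, src, passthru)
// i.e. lanes active in pg take fabs(src) and inactive lanes take the trailing
// passthru operand, which matches the result type.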
290
291// Returns true if inactive lanes are known to be zeroed by construction.
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
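// NOTE (illustrative; inferred from the code above, not upstream text): for a
// discriminator built as @llvm.ptrauth.blend(%addr, 1234) this helper returns
// {TargetConstant(1234), %addr}; for a bare constant that fits in 16 bits it
// returns {TargetConstant(imm), NoRegister}; anything else falls back to
// {TargetConstant(0), Disc} so the discriminator is computed at runtime.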
387
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
487
488 // Provide all sorts of operation actions
514 if (Subtarget->hasFPARMv8()) {
517 }
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
550
551 // Virtually no operations on f128 are legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
629
630 // Variable-sized objects.
633
634 // Lowering Funnel Shifts to EXTR
639
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
786 // fpextend from bf16 to f64 needs to be split into two fpextends
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
838 // Round-to-integer operations need custom lowering for fp16, as Promote
839 // doesn't work because the result type is integer.
843 setOperationAction(Op, ScalarVT, Custom);
844
845 // promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
917 for (MVT Ty : {MVT::f32, MVT::f64})
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
934
940
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
945 } else {
948 }
951
952 // Generate outline atomics library calls only if LSE was not specified for
953 // subtarget
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
980 }
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
987
992
997
1002
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1026 }
1027
1028 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1029 // custom lowering, as there are no un-paired non-temporal stores and
1030 // legalization will break up 256 bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1039
1040 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1041 // custom lowering, as there are no un-paired non-temporal loads and
1042 // legalization will break up 256 bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1060 } else {
1063 }
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
1073 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1096 }
1097
1098 // Indexed loads and stores are supported.
1099 for (unsigned im = (unsigned)ISD::PRE_INC;
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
1134 // Also, try to fold ADD into CSINC/CSINV.
1137
1140
1141 // Try and combine setcc with csel
1143
1145
1153
1155
1157
1159
1163
1166
1168
1170
1172
1174
1178
1180
1182
1183 // In case of strict alignment, avoid an excessive number of byte wide stores.
1186 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1187
1191 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1192
1195 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1196
1199 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1200
1202
1204
1205 EnableExtLdPromotion = true;
1206
1207 // Set required alignment.
1209 // Set preferred alignments.
1210
1211 // Don't align loops on Windows. The SEH unwind info generation needs to
1212 // know the exact length of functions before the alignments have been
1213 // expanded.
1214 if (!Subtarget->isTargetWindows())
1218
1219 // Only change the limit for entries in a jump table if specified by
1220 // the subtarget, but not at the command line.
1221 unsigned MaxJT = STI.getMaximumJumpTableSize();
1222 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1224
1226
1228
1230 if (Subtarget->hasSME())
1232
1233 if (Subtarget->isNeonAvailable()) {
1234 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1235 // silliness like this:
1236 // clang-format off
1237 for (auto Op :
1258 setOperationAction(Op, MVT::v1f64, Expand);
1259 // clang-format on
1260
1261 for (auto Op :
1266 setOperationAction(Op, MVT::v1i64, Expand);
1267
1268 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1269 // elements smaller than i32, so promote the input to i32 first.
1270 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1271 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272
1273 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1274 // nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1275 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1278 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1280
1281 if (Subtarget->hasFullFP16()) {
1284
1293 } else {
1294 // when AArch64 doesn't have fullfp16 support, promote the input
1295 // to i32 first.
1296 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1297 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1299 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1301 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1303 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 }
1305
1306 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1307 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1314 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1319 }
1320
1321 // Custom handling for some quad-vector types to detect MULL.
1322 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1323 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1324 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1325 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1326 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1327 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1328
1329 // Saturates
1330 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1331 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1336 }
1337
1338 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1339 MVT::v4i32}) {
1346 }
1347
1348 // Vector reductions
1349 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1350 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1351 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1356
1358 }
1359 }
1360 if (Subtarget->hasFullFP16())
1362
1363 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1364 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1373 }
1378
1380 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1381 // Likewise, narrowing and extending vector loads/stores aren't handled
1382 // directly.
1385
1386 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1389 } else {
1392 }
1395
1398
1399 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1400 setTruncStoreAction(VT, InnerVT, Expand);
1401 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1404 }
1405 }
1406
1407 for (auto Op :
1413 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1415 if (Subtarget->hasFullFP16())
1416 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1418 }
1419
1420 // LRINT and LLRINT.
1421 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1422 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1424 if (Subtarget->hasFullFP16())
1425 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1427 }
1428
1429 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1430
1435
1439
1440 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1441 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446
1447 // ADDP custom lowering
1448 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1450 // FADDP custom lowering
1451 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1453
1454 if (Subtarget->hasDotProd()) {
1455 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457
1458 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1459 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1461
1462 if (Subtarget->hasMatMulInt8()) {
1464 MVT::v16i8, Legal);
1466 MVT::v16i8, Custom);
1467
1469 MVT::v8i8, Legal);
1470 }
1471 }
1472
1473 } else /* !isNeonAvailable */ {
1475 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1477
1478 if (VT.is128BitVector() || VT.is64BitVector()) {
1482 Subtarget->isLittleEndian() ? Legal : Expand);
1483 }
1484 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1485 setTruncStoreAction(VT, InnerVT, Expand);
1486 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1489 }
1490 }
1491 }
1492
1493 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1497 }
1498
1499 if (Subtarget->hasSME()) {
1501 }
1502
1503 // FIXME: Move lowering for more nodes here if those are common between
1504 // SVE and SME.
1505 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1506 for (auto VT :
1507 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1512 }
1513 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1516 }
1517
1518 if (Subtarget->hasSVE2p1() ||
1519 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521
1522 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 }
1525
1526 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1527 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1568
1574
1583
1588
1589 if (!Subtarget->isLittleEndian())
1591
1592 if (Subtarget->hasSVE2() ||
1593 (Subtarget->hasSME() && Subtarget->isStreaming()))
1594 // For SLI/SRI.
1596 }
1597
1598 // Illegal unpacked integer vector types.
1599 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1602 }
1603
1604 // Type legalize unpacked bitcasts.
1605 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1607
1608 for (auto VT :
1609 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1610 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1612
1613 for (auto VT :
1614 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1622
1626
1627 // There are no legal MVT::nxv16f## based types.
1628 if (VT != MVT::nxv16i1) {
1633 }
1634 }
1635
1636 // NEON doesn't support masked loads/stores, but SME and SVE do.
1637 for (auto VT :
1638 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1639 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1640 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1643 }
1644
1645 // Firstly, exclude all scalable vector extending loads and truncating
1646 // stores, covering both integer and floating-point scalable vectors.
1648 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1649 setTruncStoreAction(VT, InnerVT, Expand);
1650 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1653 }
1654 }
1655
1656 // Then, selectively enable those which we directly support.
1657 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1660 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1662 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1663 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1664 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1667 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1669 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1670 }
1671
1672 // SVE supports truncating stores of 64 and 128-bit vectors
1673 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1676 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1678
1679 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1680 MVT::nxv4f32, MVT::nxv2f64}) {
1720
1742
1754 }
1755
1756 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1772
1773 if (Subtarget->hasSVEB16B16() &&
1774 Subtarget->isNonStreamingSVEorSME2Available()) {
1783 }
1784 }
1785
1786 for (auto Opcode :
1791 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1792 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1793 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1794 }
1795
1796 if (!Subtarget->hasSVEB16B16() ||
1797 !Subtarget->isNonStreamingSVEorSME2Available()) {
1798 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1800 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1801 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1802 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1803 }
1804 }
1805
1808
1809 // NEON doesn't support integer divides, but SVE does
1810 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1811 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1814 }
1815
1816 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1817 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1818 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1819
1820 // NOTE: Currently this has to happen after computeRegisterProperties rather
1821 // than the preferred option of combining it with the addRegisterClass call.
1822 if (Subtarget->useSVEForFixedLengthVectors()) {
1825 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1826 addTypeForFixedLengthSVE(VT);
1827 }
1830 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1831 addTypeForFixedLengthSVE(VT);
1832 }
1833
1834 // 64-bit results can mean a bigger-than-NEON input.
1835 for (auto VT : {MVT::v8i8, MVT::v4i16})
1838
1839 // 128-bit results imply a bigger-than-NEON input.
1840 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1842 for (auto VT : {MVT::v8f16, MVT::v4f32})
1844
1845 // These operations are not supported on NEON but SVE can do them.
1847 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1848 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1849 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1851 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1852 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1854 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1855 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1858 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1859 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1866
1867 // Int operations with no NEON support.
1868 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1869 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1877 }
1878
1879 // Use SVE for vectors with more than 2 elements.
1880 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1882 }
1883
1884 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1885 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1888
1890
1891 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1893 }
1894
1895 // Handle partial reduction operations
1896 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1897 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1898 // Other pairs will default to 'Expand'.
1899 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1901 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1902 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1903
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1905
1906 if (Subtarget->hasMatMulInt8()) {
1908 MVT::nxv16i8, Legal);
1910 MVT::nxv16i8, Custom);
1911 }
1912
1913 // Wide add types
1914 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1915 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1916 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1918 }
1919 }
1920
1921 // Handle operations that are only available in non-streaming SVE mode.
1922 if (Subtarget->isSVEAvailable()) {
1923 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1924 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1925 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1926 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1927 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1928 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1929 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1932 }
1933
1934 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1935 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1936 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1938
1939 // We can lower types that have <vscale x {2|4}> elements to compact.
1940 for (auto VT :
1941 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1942 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1944
1945 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1946 // NEON vectors in the lowest bits of the SVE register.
1947 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1948 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1950
1951 // Histcnt is SVE2 only
1952 if (Subtarget->hasSVE2()) {
1954 Custom);
1956 Custom);
1957
1958 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1960 // Must be lowered to SVE instructions.
1961 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1962 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1963 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1964 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1965 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1966 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1967 }
1968 }
1969
1970 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1971 // Only required for llvm.aarch64.mops.memset.tag
1973 }
1974
1976
1977 if (Subtarget->hasSVE()) {
1982 }
1983
1984 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1985
1986 IsStrictFPEnabled = true;
1988
1989 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1990 // it, but it's just a wrapper around ldexp.
1991 if (Subtarget->isTargetWindows()) {
1993 if (isOperationExpand(Op, MVT::f32))
1994 setOperationAction(Op, MVT::f32, Promote);
1995 }
1996
1997 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1998 // isn't legal.
2000 if (isOperationExpand(Op, MVT::f16))
2001 setOperationAction(Op, MVT::f16, Promote);
2002}
2003
2005 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2006}
2007
2008void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2009 assert(VT.isVector() && "VT should be a vector type");
2010
2011 if (VT.isFloatingPoint()) {
2013 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2014 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2015 }
2016
2017 // Mark vector float intrinsics as expand.
2018 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2036 }
2037
2038 // But we do support custom-lowering for FCOPYSIGN.
2039 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2040 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2041 VT == MVT::v8f16) &&
2042 Subtarget->hasFullFP16()))
2044
2057
2061 for (MVT InnerVT : MVT::all_valuetypes())
2062 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2063
2064 // CNT supports only B element sizes, then use UADDLP to widen.
2065 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2067
2073
2074 for (unsigned Opcode :
2077 setOperationAction(Opcode, VT, Custom);
2078
2079 if (!VT.isFloatingPoint())
2081
2082 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2083 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2084 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2085 setOperationAction(Opcode, VT, Legal);
2086
2087 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2088 // NEON types.
2089 if (VT.isFloatingPoint() &&
2090 VT.getVectorElementType() != MVT::bf16 &&
2091 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2092 for (unsigned Opcode :
2098 setOperationAction(Opcode, VT, Legal);
2099
2100 // Strict fp extend and trunc are legal
2101 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2103 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2105
2106 // FIXME: We could potentially make use of the vector comparison instructions
2107 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2108 // complications:
2109 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2110 // so we would need to expand when the condition code doesn't match the
2111 // kind of comparison.
2112 // * Some kinds of comparison require more than one FCMXY instruction so
2113 // would need to be expanded instead.
2114 // * The lowering of the non-strict versions involves target-specific ISD
2115 // nodes so we would likely need to add strict versions of all of them and
2116 // handle them appropriately.
2119
2120 // When little-endian we can use ordinary d and q register loads/stores for
2121 // vector types, but when big-endian we need to use structure load/store which
2122 // only allow post-index addressing.
2123 if (Subtarget->isLittleEndian()) {
2124 for (unsigned im = (unsigned)ISD::PRE_INC;
2128 }
2129 } else {
2132 }
2133
2134 if (Subtarget->hasD128()) {
2137 }
2138
2139 if (VT.isInteger()) {
2140 // Let common code emit inverted variants of compares we do support.
2146 }
2147}
2148
2150 EVT OpVT) const {
2151 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2152 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2153 ResVT.getVectorElementType() != MVT::i1)
2154 return true;
2155
2156 // Only support illegal types if the result is scalable and min elements > 1.
2157 if (ResVT.getVectorMinNumElements() == 1 ||
2158 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2159 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2160 return true;
2161
2162 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2163 // but anything larger should be expanded.
2164 if (OpVT.getFixedSizeInBits() > 64)
2165 return true;
2166
2167 return false;
2168}
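// NOTE (illustrative; assuming this is the get.active.lane.mask expansion
// hook): an nxv4i1 result with i32 or i64 operands maps 1:1 onto whilelo and
// is not expanded, whereas single-element results or operands wider than
// 64 bits are expanded generically.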
2169
2171 const IntrinsicInst *I) const {
2172 assert(I->getIntrinsicID() ==
2173 Intrinsic::experimental_vector_partial_reduce_add &&
2174 "Unexpected intrinsic!");
2175 return true;
2176}
2177
2179 if (!Subtarget->isSVEorStreamingSVEAvailable())
2180 return true;
2181
2182 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2183 // also support fixed-width predicates.
2184 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2185 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2186 VT != MVT::v4i1 && VT != MVT::v2i1;
2187}
2188
2190 unsigned SearchSize) const {
2191 // MATCH is SVE2 and only available in non-streaming mode.
2192 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2193 return true;
2194 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2195 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2196 return SearchSize != 8;
2197 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2198 return SearchSize != 8 && SearchSize != 16;
2199 return true;
2200}
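// NOTE (illustrative; inferred from the checks above): with non-streaming
// SVE2, a match over nxv16i8, v16i8 or v8i8 using an 8- or 16-element search
// vector, or over nxv8i16/v8i16 using an 8-element search vector, lowers to
// MATCH; every other combination is expanded.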
2201
2202void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2203 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2204
2205 // By default everything must be expanded.
2206 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2208
2209 if (VT.isFloatingPoint()) {
2219 }
2220
2222 VT == MVT::v1f64 ? Expand : Custom;
2223
2224 // Mark integer truncating stores/extending loads as having custom lowering
2225 if (VT.isInteger()) {
2226 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2227 while (InnerVT != VT) {
2228 setTruncStoreAction(VT, InnerVT, Default);
2229 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2230 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2231 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2232 InnerVT = InnerVT.changeVectorElementType(
2233 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2234 }
2235 }
2236
2237 // Mark floating-point truncating stores/extending loads as having custom
2238 // lowering
2239 if (VT.isFloatingPoint()) {
2240 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2241 while (InnerVT != VT) {
2242 setTruncStoreAction(VT, InnerVT, Custom);
2243 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2244 InnerVT = InnerVT.changeVectorElementType(
2246 }
2247 }
2248
2249 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2250 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2251
2252 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2254 unsigned NumElts = VT.getVectorNumElements();
2255 if (VT.getVectorElementType() == MVT::i64) {
2256 setPartialReduceMLAAction(MLAOps, VT,
2257 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2258 setPartialReduceMLAAction(MLAOps, VT,
2259 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2260 setPartialReduceMLAAction(MLAOps, VT,
2261 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2262 } else if (VT.getVectorElementType() == MVT::i32) {
2263 setPartialReduceMLAAction(MLAOps, VT,
2264 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2265 setPartialReduceMLAAction(MLAOps, VT,
2266 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2267 } else if (VT.getVectorElementType() == MVT::i16) {
2268 setPartialReduceMLAAction(MLAOps, VT,
2269 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2270 }
2271 if (Subtarget->hasMatMulInt8()) {
2272 if (VT.getVectorElementType() == MVT::i32)
2274 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2275 else if (VT.getVectorElementType() == MVT::i64)
2277 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2278 }
2279
2280 // Lower fixed length vector operations to scalable equivalents.
2287 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2325 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2326 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2328 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2347 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2373}
2374
2375void AArch64TargetLowering::addDRType(MVT VT) {
2376 addRegisterClass(VT, &AArch64::FPR64RegClass);
2377 if (Subtarget->isNeonAvailable())
2378 addTypeForNEON(VT);
2379}
2380
2381void AArch64TargetLowering::addQRType(MVT VT) {
2382 addRegisterClass(VT, &AArch64::FPR128RegClass);
2383 if (Subtarget->isNeonAvailable())
2384 addTypeForNEON(VT);
2385}
2386
2388 LLVMContext &C, EVT VT) const {
2389 if (!VT.isVector())
2390 return MVT::i32;
2391 if (VT.isScalableVector())
2392 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2394}
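// NOTE (illustrative; assuming this is getSetCCResultType): a scalar compare
// produces an i32 condition value, while a compare of e.g. nxv4f32 produces
// an nxv4i1 predicate with the same element count.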
2395
2396// isIntImmediate - This method tests to see if the node is a constant
2397// operand. If so, Imm will receive the value.
2398static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2399 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2400 Imm = C->getZExtValue();
2401 return true;
2402 }
2403 return false;
2404}
2405
2406bool isVectorizedBinOp(unsigned Opcode) {
2407 switch (Opcode) {
2408 case AArch64ISD::SQDMULH:
2409 return true;
2410 default:
2411 return false;
2412 }
2413}
2414
2415// isOpcWithIntImmediate - This method tests to see if the node is a specific
2416// opcode and that it has an immediate integer right operand.
2417// If so, Imm will receive the value.
2418static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2419 uint64_t &Imm) {
2420 return N->getOpcode() == Opc &&
2421 isIntImmediate(N->getOperand(1).getNode(), Imm);
2422}
2423
2424static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2425 const APInt &Demanded,
2427 unsigned NewOpc) {
2428 uint64_t OldImm = Imm, NewImm, Enc;
2429 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2430
2431 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2432 // bimm64.
2433 if (Imm == 0 || Imm == Mask ||
2435 return false;
2436
2437 unsigned EltSize = Size;
2438 uint64_t DemandedBits = Demanded.getZExtValue();
2439
2440 // Clear bits that are not demanded.
2441 Imm &= DemandedBits;
2442
2443 while (true) {
2444 // The goal here is to set the non-demanded bits in a way that minimizes
2445 // the number of switching between 0 and 1. In order to achieve this goal,
2446 // we set the non-demanded bits to the value of the preceding demanded bits.
2447 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2448 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2449 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2450 // The final result is 0b11000011.
2451 uint64_t NonDemandedBits = ~DemandedBits;
2452 uint64_t InvertedImm = ~Imm & DemandedBits;
2453 uint64_t RotatedImm =
2454 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2455 NonDemandedBits;
2456 uint64_t Sum = RotatedImm + NonDemandedBits;
2457 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2458 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2459 NewImm = (Imm | Ones) & Mask;
2460
2461 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2462 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2463 // we halve the element size and continue the search.
2464 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2465 break;
2466
2467 // We cannot shrink the element size any further if it is 2-bits.
2468 if (EltSize == 2)
2469 return false;
2470
2471 EltSize /= 2;
2472 Mask >>= EltSize;
2473 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2474
2475 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2476 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2477 return false;
2478
2479 // Merge the upper and lower halves of Imm and DemandedBits.
2480 Imm |= Hi;
2481 DemandedBits |= DemandedBitsHi;
2482 }
2483
2484 ++NumOptimizedImms;
2485
2486 // Replicate the element across the register width.
2487 while (EltSize < Size) {
2488 NewImm |= NewImm << EltSize;
2489 EltSize *= 2;
2490 }
2491
2492 (void)OldImm;
2493 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2494 "demanded bits should never be altered");
2495 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2496
2497 // Create the new constant immediate node.
2498 EVT VT = Op.getValueType();
2499 SDLoc DL(Op);
2500 SDValue New;
2501
2502 // If the new constant immediate is all-zeros or all-ones, let the target
2503 // independent DAG combine optimize this node.
2504 if (NewImm == 0 || NewImm == OrigMask) {
2505 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2506 TLO.DAG.getConstant(NewImm, DL, VT));
2507 // Otherwise, create a machine node so that target independent DAG combine
2508 // doesn't undo this optimization.
2509 } else {
2511 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2512 New = SDValue(
2513 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2514 }
2515
2516 return TLO.CombineTo(Op, New);
2517}
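// NOTE (worked example, not upstream text): for i32 (and X, 0xFFF1), which is
// not a valid logical immediate, with only the low 8 bits demanded, the loop
// above copies the preceding demanded bit (bit 7) into the non-demanded bits
// 8..31, yielding 0xFFFFFFF1, a rotated run of ones that encodes directly as
// an ANDWri immediate.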
2518
2520 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2521 TargetLoweringOpt &TLO) const {
2522 // Delay this optimization to as late as possible.
2523 if (!TLO.LegalOps)
2524 return false;
2525
2527 return false;
2528
2529 EVT VT = Op.getValueType();
2530 if (VT.isVector())
2531 return false;
2532
2533 unsigned Size = VT.getSizeInBits();
2534
2535 if (Size != 32 && Size != 64)
2536 return false;
2537
2538 // Exit early if we demand all bits.
2539 if (DemandedBits.popcount() == Size)
2540 return false;
2541
2542 unsigned NewOpc;
2543 switch (Op.getOpcode()) {
2544 default:
2545 return false;
2546 case ISD::AND:
2547 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2548 break;
2549 case ISD::OR:
2550 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2551 break;
2552 case ISD::XOR:
2553 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2554 break;
2555 }
2556 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2557 if (!C)
2558 return false;
2559 uint64_t Imm = C->getZExtValue();
2560 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2561}
2562
2563/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2564/// Mask are known to be either zero or one and return them Known.
2566 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2567 const SelectionDAG &DAG, unsigned Depth) const {
2568 switch (Op.getOpcode()) {
2569 default:
2570 break;
2571 case AArch64ISD::DUP: {
2572 SDValue SrcOp = Op.getOperand(0);
2573 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2574 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2575 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2576 "Expected DUP implicit truncation");
2577 Known = Known.trunc(Op.getScalarValueSizeInBits());
2578 }
2579 break;
2580 }
2581 case AArch64ISD::CSEL: {
2582 KnownBits Known2;
2583 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2584 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2585 Known = Known.intersectWith(Known2);
2586 break;
2587 }
2588 case AArch64ISD::CSNEG:
2589 case AArch64ISD::CSINC:
2590 case AArch64ISD::CSINV: {
2591 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2592 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2593
2594 // The result is either:
2595 // CSINC: KnownOp0 or KnownOp1 + 1
2596 // CSINV: KnownOp0 or ~KnownOp1
2597 // CSNEG: KnownOp0 or KnownOp1 * -1
2598 if (Op.getOpcode() == AArch64ISD::CSINC)
2599 KnownOp1 = KnownBits::add(
2600 KnownOp1,
2601 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2602 else if (Op.getOpcode() == AArch64ISD::CSINV)
2603 std::swap(KnownOp1.Zero, KnownOp1.One);
2604 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2605 KnownOp1 =
2607 Op.getScalarValueSizeInBits())));
2608
2609 Known = KnownOp0.intersectWith(KnownOp1);
2610 break;
2611 }
2612 case AArch64ISD::BICi: {
2613 // Compute the bit cleared value.
2614 APInt Mask =
2615 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2616 .trunc(Known.getBitWidth());
2617 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2618 Known &= KnownBits::makeConstant(Mask);
2619 break;
2620 }
2621 case AArch64ISD::VLSHR: {
2622 KnownBits Known2;
2623 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2624 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2625 Known = KnownBits::lshr(Known, Known2);
2626 break;
2627 }
2628 case AArch64ISD::VASHR: {
2629 KnownBits Known2;
2630 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2631 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2632 Known = KnownBits::ashr(Known, Known2);
2633 break;
2634 }
2635 case AArch64ISD::VSHL: {
2636 KnownBits Known2;
2637 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2638 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2639 Known = KnownBits::shl(Known, Known2);
2640 break;
2641 }
2642 case AArch64ISD::MOVI: {
2644 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2645 break;
2646 }
2647 case AArch64ISD::MOVIshift: {
2649 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2650 << Op->getConstantOperandVal(1)));
2651 break;
2652 }
2653 case AArch64ISD::LOADgot:
2654 case AArch64ISD::ADDlow: {
2655 if (!Subtarget->isTargetILP32())
2656 break;
2657 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2658 Known.Zero = APInt::getHighBitsSet(64, 32);
2659 break;
2660 }
2661 case AArch64ISD::ASSERT_ZEXT_BOOL: {
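// The operand is a boolean zero-extended into this type, so at least bits
// 1-7 of the result are known to be zero.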
2662 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2663 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2664 break;
2665 }
2667 Intrinsic::ID IntID =
2668 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2669 switch (IntID) {
2670 default: return;
2671 case Intrinsic::aarch64_ldaxr:
2672 case Intrinsic::aarch64_ldxr: {
2673 unsigned BitWidth = Known.getBitWidth();
2674 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2675 unsigned MemBits = VT.getScalarSizeInBits();
2676 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2677 return;
2678 }
2679 }
2680 break;
2681 }
2683 case ISD::INTRINSIC_VOID: {
2684 unsigned IntNo = Op.getConstantOperandVal(0);
2685 switch (IntNo) {
2686 default:
2687 break;
2688 case Intrinsic::aarch64_neon_uaddlv: {
2689 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2690 unsigned BitWidth = Known.getBitWidth();
2691 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2692 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2693 assert(BitWidth >= Bound && "Unexpected width!");
2695 Known.Zero |= Mask;
2696 }
2697 break;
2698 }
2699 case Intrinsic::aarch64_neon_umaxv:
2700 case Intrinsic::aarch64_neon_uminv: {
2701 // Figure out the datatype of the vector operand. The UMINV instruction
2702 // will zero extend the result, so we can mark as known zero all the
2703 // bits larger than the element datatype. 32-bit or larger doesn't need
2704 // this as those are legal types and will be handled by isel directly.
2705 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2706 unsigned BitWidth = Known.getBitWidth();
2707 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2708 assert(BitWidth >= 8 && "Unexpected width!");
2710 Known.Zero |= Mask;
2711 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2712 assert(BitWidth >= 16 && "Unexpected width!");
2714 Known.Zero |= Mask;
2715 }
2716 break;
2717 } break;
2718 }
2719 }
2720 }
2721}
2722
2724 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2725 unsigned Depth) const {
2726 EVT VT = Op.getValueType();
2727 unsigned VTBits = VT.getScalarSizeInBits();
2728 unsigned Opcode = Op.getOpcode();
2729 switch (Opcode) {
2730 case AArch64ISD::FCMEQ:
2731 case AArch64ISD::FCMGE:
2732 case AArch64ISD::FCMGT:
2733 // Compares return either 0 or all-ones
2734 return VTBits;
2735 case AArch64ISD::VASHR: {
2736 unsigned Tmp =
2737 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2738 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2739 }
2740 }
2741
2742 return 1;
2743}
2744
2746 EVT) const {
2747 return MVT::i64;
2748}
2749
2751 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2752 unsigned *Fast) const {
2753
2754 // Allow SVE loads/stores where the alignment >= the size of the element type,
2755 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2756 // for stores that come from IR, only require element-size alignment (even if
2757 // unaligned accesses are disabled). Without this, these will be forced to
2758 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2759 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2760 if (VT.isScalableVector()) {
2761 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2762 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2763 return true;
2764 }
2765
2766 if (Subtarget->requiresStrictAlign())
2767 return false;
2768
2769 if (Fast) {
2770 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2771 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2772 // See comments in performSTORECombine() for more details about
2773 // these conditions.
2774
2775 // Code that uses clang vector extensions can mark that it
2776 // wants unaligned accesses to be treated as fast by
2777 // underspecifying alignment to be 1 or 2.
2778 Alignment <= 2 ||
2779
2780 // Disregard v2i64. Memcpy lowering produces those and splitting
2781 // them regresses performance on micro-benchmarks and olden/bh.
2782 VT == MVT::v2i64;
2783 }
2784 return true;
2785}
2786
2787// Same as above but handling LLTs instead.
2789 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2790 unsigned *Fast) const {
2791 if (Subtarget->requiresStrictAlign())
2792 return false;
2793
2794 if (Fast) {
2795 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2796 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2797 Ty.getSizeInBytes() != 16 ||
2798 // See comments in performSTORECombine() for more details about
2799 // these conditions.
2800
2801 // Code that uses clang vector extensions can mark that it
2802 // wants unaligned accesses to be treated as fast by
2803 // underspecifying alignment to be 1 or 2.
2804 Alignment <= 2 ||
2805
2806 // Disregard v2i64. Memcpy lowering produces those and splitting
2807 // them regresses performance on micro-benchmarks and olden/bh.
2808 Ty == LLT::fixed_vector(2, 64);
2809 }
2810 return true;
2811}
2812
2813FastISel *
2815 const TargetLibraryInfo *libInfo) const {
2816 return AArch64::createFastISel(funcInfo, libInfo);
2817}
2818
2821 MachineBasicBlock *MBB) const {
2822 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2823 // phi node:
2824
2825 // OrigBB:
2826 // [... previous instrs leading to comparison ...]
2827 // b.ne TrueBB
2828 // b EndBB
2829 // TrueBB:
2830 // ; Fallthrough
2831 // EndBB:
2832 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2833
2834 MachineFunction *MF = MBB->getParent();
2835 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2836 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2837 DebugLoc DL = MI.getDebugLoc();
2839
2840 Register DestReg = MI.getOperand(0).getReg();
2841 Register IfTrueReg = MI.getOperand(1).getReg();
2842 Register IfFalseReg = MI.getOperand(2).getReg();
2843 unsigned CondCode = MI.getOperand(3).getImm();
2844 bool NZCVKilled = MI.getOperand(4).isKill();
2845
2846 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2847 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2848 MF->insert(It, TrueBB);
2849 MF->insert(It, EndBB);
2850
2851 // Transfer rest of current basic-block to EndBB
2852 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2853 MBB->end());
2855
2856 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2857 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2858 MBB->addSuccessor(TrueBB);
2859 MBB->addSuccessor(EndBB);
2860
2861 // TrueBB falls through to the end.
2862 TrueBB->addSuccessor(EndBB);
2863
2864 if (!NZCVKilled) {
2865 TrueBB->addLiveIn(AArch64::NZCV);
2866 EndBB->addLiveIn(AArch64::NZCV);
2867 }
2868
2869 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2870 .addReg(IfTrueReg)
2871 .addMBB(TrueBB)
2872 .addReg(IfFalseReg)
2873 .addMBB(MBB);
2874
2875 MI.eraseFromParent();
2876 return EndBB;
2877}
2878
2880 MachineInstr &MI, MachineBasicBlock *BB) const {
2882 BB->getParent()->getFunction().getPersonalityFn())) &&
2883 "SEH does not use catchret!");
2884 return BB;
2885}
2886
2889 MachineBasicBlock *MBB) const {
2890 MachineFunction &MF = *MBB->getParent();
2891 MachineBasicBlock::iterator MBBI = MI.getIterator();
2892 const AArch64InstrInfo &TII =
2893 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2894 Register TargetReg = MI.getOperand(0).getReg();
2896 TII.probedStackAlloc(MBBI, TargetReg, false);
2897
2898 MI.eraseFromParent();
2899 return NextInst->getParent();
2900}
2901
2903AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2905 MachineBasicBlock *BB) const {
2906 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2907 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2908
2909 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2910 MIB.add(MI.getOperand(1)); // slice index register
2911 MIB.add(MI.getOperand(2)); // slice index offset
2912 MIB.add(MI.getOperand(3)); // pg
2913 MIB.add(MI.getOperand(4)); // base
2914 MIB.add(MI.getOperand(5)); // offset
2915
2916 MI.eraseFromParent(); // The pseudo is gone now.
2917 return BB;
2918}
2919
2922 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2924 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2925
2926 MIB.addReg(AArch64::ZA, RegState::Define);
2927 MIB.add(MI.getOperand(0)); // Vector select register
2928 MIB.add(MI.getOperand(1)); // Vector select offset
2929 MIB.add(MI.getOperand(2)); // Base
2930 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2931
2932 MI.eraseFromParent(); // The pseudo is gone now.
2933 return BB;
2934}
2935
2938 unsigned Opcode,
2939 bool Op0IsDef) const {
2940 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2942
2943 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2944 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2945 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2946 MIB.add(MI.getOperand(I));
2947
2948 MI.eraseFromParent(); // The pseudo is gone now.
2949 return BB;
2950}
2951
2953AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2955 MachineBasicBlock *BB) const {
2956 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2957 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2958 unsigned StartIdx = 0;
2959
2960 bool HasTile = BaseReg != AArch64::ZA;
2961 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
2962 if (HasZPROut) {
2963 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
2964 ++StartIdx;
2965 }
2966 if (HasTile) {
2967 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
2968 RegState::Define); // Output ZA Tile
2969 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
2970 StartIdx++;
2971 } else {
2972 // Skip adding an output ZPR for instructions whose leading operands form a ZA slice (mnemonic za.<sz>[Reg, Imm, ...]).
2973 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
2974 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
2975 ++StartIdx;
2976 }
2977 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2978 }
2979 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2980 MIB.add(MI.getOperand(I));
2981
2982 MI.eraseFromParent(); // The pseudo is gone now.
2983 return BB;
2984}
2985
2988 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2990 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2991 MIB.add(MI.getOperand(0)); // Mask
2992
2993 unsigned Mask = MI.getOperand(0).getImm();
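// Each set bit of the mask selects one 64-bit ZA tile (ZAD0..ZAD7) that the
// ZERO instruction clears; record those tiles as implicit defs.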
2994 for (unsigned I = 0; I < 8; I++) {
2995 if (Mask & (1 << I))
2996 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2997 }
2998
2999 MI.eraseFromParent(); // The pseudo is gone now.
3000 return BB;
3001}
3002
3005 MachineBasicBlock *BB) const {
3006 MachineFunction *MF = BB->getParent();
3007 MachineFrameInfo &MFI = MF->getFrameInfo();
3009 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3010 if (TPIDR2.Uses > 0) {
3011 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3012 // generally don't support big-endian SVE/SME.
3013 if (!Subtarget->isLittleEndian())
3015 "TPIDR2 block initialization is not supported on big-endian targets");
3016
3017 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3018 // Store buffer pointer and num_za_save_slices.
3019 // Bytes 10-15 are implicitly zeroed.
3020 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3021 .addReg(MI.getOperand(0).getReg())
3022 .addReg(MI.getOperand(1).getReg())
3023 .addFrameIndex(TPIDR2.FrameIndex)
3024 .addImm(0);
3025 } else
3026 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3027
3028 BB->remove_instr(&MI);
3029 return BB;
3030}
3031
3034 MachineBasicBlock *BB) const {
3035 MachineFunction *MF = BB->getParent();
3036 MachineFrameInfo &MFI = MF->getFrameInfo();
3038 // TODO This function grows the stack with a subtraction, which doesn't work
3039 // on Windows. Some refactoring to share the functionality in
3040 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3041 // supports SME
3043 "Lazy ZA save is not yet supported on Windows");
3044
3045 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3046
3047 if (TPIDR2.Uses > 0) {
3048 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3050
3051 // The MSUBXrrr below won't always be emitted in a form that accepts SP
3052 // directly, so copy SP into a virtual register first.
3053 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3054 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3055 .addReg(AArch64::SP);
3056
3057 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
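// MSUB computes Dest = SP - Size * Size; SP is then updated to Dest below,
// growing the stack downwards by the buffer size.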
3058 auto Size = MI.getOperand(1).getReg();
3059 auto Dest = MI.getOperand(0).getReg();
3060 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3061 .addReg(Size)
3062 .addReg(Size)
3063 .addReg(SP);
3064 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3065 AArch64::SP)
3066 .addReg(Dest);
3067
3068 // We have just allocated a variable sized object, tell this to PEI.
3069 MFI.CreateVariableSizedObject(Align(16), nullptr);
3070 }
3071
3072 BB->remove_instr(&MI);
3073 return BB;
3074}
3075
3076// TODO: Find a way to merge this with EmitAllocateZABuffer.
3079 MachineBasicBlock *BB) const {
3080 MachineFunction *MF = BB->getParent();
3081 MachineFrameInfo &MFI = MF->getFrameInfo();
3084 "Lazy ZA save is not yet supported on Windows");
3085
3086 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3087 if (FuncInfo->isSMESaveBufferUsed()) {
3088 // Allocate a buffer object of the size given by MI.getOperand(1).
3089 auto Size = MI.getOperand(1).getReg();
3090 auto Dest = MI.getOperand(0).getReg();
3091 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3092 .addReg(AArch64::SP)
3093 .addReg(Size)
3095 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3096 .addReg(AArch64::SP);
3097
3098 // We have just allocated a variable sized object, tell this to PEI.
3099 MFI.CreateVariableSizedObject(Align(16), nullptr);
3100 } else
3101 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3102 MI.getOperand(0).getReg());
3103
3104 BB->remove_instr(&MI);
3105 return BB;
3106}
3107
3110 MachineBasicBlock *BB) const {
3111 // If the buffer is used, emit a call to __arm_sme_state_size()
3112 MachineFunction *MF = BB->getParent();
3114 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3115 if (FuncInfo->isSMESaveBufferUsed()) {
3116 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3117 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3118 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3120 .addReg(AArch64::X0, RegState::ImplicitDefine)
3121 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3122 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3123 MI.getOperand(0).getReg())
3124 .addReg(AArch64::X0);
3125 } else
3126 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3127 MI.getOperand(0).getReg())
3128 .addReg(AArch64::XZR);
3129 BB->remove_instr(&MI);
3130 return BB;
3131}
3132
3135 MachineBasicBlock *BB) const {
3136 MachineFunction *MF = BB->getParent();
3138 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3139 Register ResultReg = MI.getOperand(0).getReg();
3140 if (FuncInfo->isPStateSMRegUsed()) {
3141 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3142 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3143 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3145 .addReg(AArch64::X0, RegState::ImplicitDefine)
3146 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3147 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
3148 .addReg(AArch64::X0);
3149 } else {
3150 assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
3151 "Expected no users of the entry pstate.sm!");
3152 }
3153 MI.eraseFromParent();
3154 return BB;
3155}
3156
3157// Helper function to find the instruction that defined a virtual register.
3158// If unable to find such an instruction, returns nullptr.
3160 Register Reg) {
3161 while (Reg.isVirtual()) {
3162 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3163 assert(DefMI && "Virtual register definition not found");
3164 unsigned Opcode = DefMI->getOpcode();
3165
3166 if (Opcode == AArch64::COPY) {
3167 Reg = DefMI->getOperand(1).getReg();
3168 // Vreg is defined by copying from physreg.
3169 if (Reg.isPhysical())
3170 return DefMI;
3171 continue;
3172 }
3173 if (Opcode == AArch64::SUBREG_TO_REG) {
3174 Reg = DefMI->getOperand(2).getReg();
3175 continue;
3176 }
3177
3178 return DefMI;
3179 }
3180 return nullptr;
3181}
3182
3185 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3186 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3187 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3188 const DebugLoc &DL = MI.getDebugLoc();
3189
3190 Register AddrDisc = AddrDiscOp.getReg();
3191 int64_t IntDisc = IntDiscOp.getImm();
3192 assert(IntDisc == 0 && "Blend components are already expanded");
3193
3194 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
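// If the discriminator was materialized as blend(addr, imm) (a MOVK into bits
// 48-63) or as a small integer constant, split it back into an address part
// and a 16-bit integer part so the immediate can be encoded directly.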
3195 if (DiscMI) {
3196 switch (DiscMI->getOpcode()) {
3197 case AArch64::MOVKXi:
3198 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3199 // #imm should be an immediate and not a global symbol, for example.
3200 if (DiscMI->getOperand(2).isImm() &&
3201 DiscMI->getOperand(3).getImm() == 48) {
3202 AddrDisc = DiscMI->getOperand(1).getReg();
3203 IntDisc = DiscMI->getOperand(2).getImm();
3204 }
3205 break;
3206 case AArch64::MOVi32imm:
3207 case AArch64::MOVi64imm:
3208 // Small immediate integer constant passed via VReg.
3209 if (DiscMI->getOperand(1).isImm() &&
3210 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3211 AddrDisc = AArch64::NoRegister;
3212 IntDisc = DiscMI->getOperand(1).getImm();
3213 }
3214 break;
3215 }
3216 }
3217
3218 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3219 // in the requested register class.
3220 if (AddrDisc == AArch64::XZR)
3221 AddrDisc = AArch64::NoRegister;
3222
3223 // Make sure AddrDisc operand respects the register class imposed by MI.
3224 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3225 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3226 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3227 AddrDisc = TmpReg;
3228 }
3229
3230 AddrDiscOp.setReg(AddrDisc);
3231 IntDiscOp.setImm(IntDisc);
3232}
3233
3235 MachineInstr &MI, MachineBasicBlock *BB) const {
3236
3237 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3238 if (SMEOrigInstr != -1) {
3239 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3240 uint64_t SMEMatrixType =
3241 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3242 switch (SMEMatrixType) {
3244 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3246 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3248 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3250 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3252 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3254 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3255 }
3256 }
3257
3258 switch (MI.getOpcode()) {
3259 default:
3260#ifndef NDEBUG
3261 MI.dump();
3262#endif
3263 llvm_unreachable("Unexpected instruction for custom inserter!");
3264 case AArch64::InitTPIDR2Obj:
3265 return EmitInitTPIDR2Object(MI, BB);
3266 case AArch64::AllocateZABuffer:
3267 return EmitAllocateZABuffer(MI, BB);
3268 case AArch64::AllocateSMESaveBuffer:
3269 return EmitAllocateSMESaveBuffer(MI, BB);
3270 case AArch64::GetSMESaveSize:
3271 return EmitGetSMESaveSize(MI, BB);
3272 case AArch64::EntryPStateSM:
3273 return EmitEntryPStateSM(MI, BB);
3274 case AArch64::F128CSEL:
3275 return EmitF128CSEL(MI, BB);
3276 case TargetOpcode::STATEPOINT:
3277 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3278 // while the BL call instruction (to which the statepoint is lowered in the
3279 // end) has an implicit def. This def is early-clobber as it will be set at
3280 // the moment of the call and earlier than any use is read.
3281 // Add this implicit dead def here as a workaround.
3282 MI.addOperand(*MI.getMF(),
3284 AArch64::LR, /*isDef*/ true,
3285 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3286 /*isUndef*/ false, /*isEarlyClobber*/ true));
3287 [[fallthrough]];
3288 case TargetOpcode::STACKMAP:
3289 case TargetOpcode::PATCHPOINT:
3290 return emitPatchPoint(MI, BB);
3291
3292 case TargetOpcode::PATCHABLE_EVENT_CALL:
3293 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3294 return BB;
3295
3296 case AArch64::CATCHRET:
3297 return EmitLoweredCatchRet(MI, BB);
3298
3299 case AArch64::PROBED_STACKALLOC_DYN:
3300 return EmitDynamicProbedAlloc(MI, BB);
3301
3302 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3303 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3304 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3305 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3306 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3307 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3308 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3309 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3310 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3311 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3312 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3313 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3314 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3315 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3316 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3317 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3318 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3319 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3320 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3321 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3322 case AArch64::LDR_ZA_PSEUDO:
3323 return EmitFill(MI, BB);
3324 case AArch64::LDR_TX_PSEUDO:
3325 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3326 case AArch64::STR_TX_PSEUDO:
3327 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3328 case AArch64::ZERO_M_PSEUDO:
3329 return EmitZero(MI, BB);
3330 case AArch64::ZERO_T_PSEUDO:
3331 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3332 case AArch64::MOVT_TIZ_PSEUDO:
3333 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3334
3335 case AArch64::PAC:
3336 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3337 &AArch64::GPR64noipRegClass);
3338 return BB;
3339 }
3340}
3341
3342//===----------------------------------------------------------------------===//
3343// AArch64 Lowering private implementation.
3344//===----------------------------------------------------------------------===//
3345
3346//===----------------------------------------------------------------------===//
3347// Lowering Code
3348//===----------------------------------------------------------------------===//
3349
3350// Forward declarations of SVE fixed length lowering helpers
3355 SelectionDAG &DAG);
3358 EVT VT);
3359
3360/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3361static bool isZerosVector(const SDNode *N) {
3362 // Look through a bit convert.
3363 while (N->getOpcode() == ISD::BITCAST)
3364 N = N->getOperand(0).getNode();
3365
3367 return true;
3368
3369 if (N->getOpcode() != AArch64ISD::DUP)
3370 return false;
3371
3372 auto Opnd0 = N->getOperand(0);
3373 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3374}
3375
3376/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3377/// CC
3379 SDValue RHS = {}) {
3380 switch (CC) {
3381 default:
3382 llvm_unreachable("Unknown condition code!");
3383 case ISD::SETNE:
3384 return AArch64CC::NE;
3385 case ISD::SETEQ:
3386 return AArch64CC::EQ;
3387 case ISD::SETGT:
3388 return AArch64CC::GT;
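// For comparisons against zero the sign of the LHS is held in the N flag, and
// the flag-setting instructions used here (SUBS with #0, ANDS) leave V clear,
// so GE/LT can be tested as PL/MI.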
3389 case ISD::SETGE:
3390 return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE;
3391 case ISD::SETLT:
3392 return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT;
3393 case ISD::SETLE:
3394 return AArch64CC::LE;
3395 case ISD::SETUGT:
3396 return AArch64CC::HI;
3397 case ISD::SETUGE:
3398 return AArch64CC::HS;
3399 case ISD::SETULT:
3400 return AArch64CC::LO;
3401 case ISD::SETULE:
3402 return AArch64CC::LS;
3403 }
3404}
3405
3406/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3408 AArch64CC::CondCode &CondCode,
3409 AArch64CC::CondCode &CondCode2) {
3410 CondCode2 = AArch64CC::AL;
3411 switch (CC) {
3412 default:
3413 llvm_unreachable("Unknown FP condition!");
3414 case ISD::SETEQ:
3415 case ISD::SETOEQ:
3416 CondCode = AArch64CC::EQ;
3417 break;
3418 case ISD::SETGT:
3419 case ISD::SETOGT:
3420 CondCode = AArch64CC::GT;
3421 break;
3422 case ISD::SETGE:
3423 case ISD::SETOGE:
3424 CondCode = AArch64CC::GE;
3425 break;
3426 case ISD::SETOLT:
3427 CondCode = AArch64CC::MI;
3428 break;
3429 case ISD::SETOLE:
3430 CondCode = AArch64CC::LS;
3431 break;
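// ONE (ordered and not equal) is OLT || OGT, so it is tested as MI or-ed
// with GT.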
3432 case ISD::SETONE:
3433 CondCode = AArch64CC::MI;
3434 CondCode2 = AArch64CC::GT;
3435 break;
3436 case ISD::SETO:
3437 CondCode = AArch64CC::VC;
3438 break;
3439 case ISD::SETUO:
3440 CondCode = AArch64CC::VS;
3441 break;
3442 case ISD::SETUEQ:
3443 CondCode = AArch64CC::EQ;
3444 CondCode2 = AArch64CC::VS;
3445 break;
3446 case ISD::SETUGT:
3447 CondCode = AArch64CC::HI;
3448 break;
3449 case ISD::SETUGE:
3450 CondCode = AArch64CC::PL;
3451 break;
3452 case ISD::SETLT:
3453 case ISD::SETULT:
3454 CondCode = AArch64CC::LT;
3455 break;
3456 case ISD::SETLE:
3457 case ISD::SETULE:
3458 CondCode = AArch64CC::LE;
3459 break;
3460 case ISD::SETNE:
3461 case ISD::SETUNE:
3462 CondCode = AArch64CC::NE;
3463 break;
3464 }
3465}
3466
3467/// Convert a DAG fp condition code to an AArch64 CC.
3468/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3469/// should be AND'ed instead of OR'ed.
3471 AArch64CC::CondCode &CondCode,
3472 AArch64CC::CondCode &CondCode2) {
3473 CondCode2 = AArch64CC::AL;
3474 switch (CC) {
3475 default:
3476 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3477 assert(CondCode2 == AArch64CC::AL);
3478 break;
3479 case ISD::SETONE:
3480 // (a one b)
3481 // == ((a olt b) || (a ogt b))
3482 // == ((a ord b) && (a une b))
3483 CondCode = AArch64CC::VC;
3484 CondCode2 = AArch64CC::NE;
3485 break;
3486 case ISD::SETUEQ:
3487 // (a ueq b)
3488 // == ((a uno b) || (a oeq b))
3489 // == ((a ule b) && (a uge b))
3490 CondCode = AArch64CC::PL;
3491 CondCode2 = AArch64CC::LE;
3492 break;
3493 }
3494}
3495
3496/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3497/// CC usable with the vector instructions. Fewer operations are available
3498/// without a real NZCV register, so we have to use less efficient combinations
3499/// to get the same effect.
3501 AArch64CC::CondCode &CondCode,
3502 AArch64CC::CondCode &CondCode2,
3503 bool &Invert) {
3504 Invert = false;
3505 switch (CC) {
3506 default:
3507 // Mostly the scalar mappings work fine.
3508 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3509 break;
3510 case ISD::SETUO:
3511 Invert = true;
3512 [[fallthrough]];
3513 case ISD::SETO:
3514 CondCode = AArch64CC::MI;
3515 CondCode2 = AArch64CC::GE;
3516 break;
3517 case ISD::SETUEQ:
3518 case ISD::SETULT:
3519 case ISD::SETULE:
3520 case ISD::SETUGT:
3521 case ISD::SETUGE:
3522 // All of the compare-mask comparisons are ordered, but we can switch
3523 // between the two by a double inversion. E.g. ULE == !OGT.
3524 Invert = true;
3525 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3526 CondCode, CondCode2);
3527 break;
3528 }
3529}
3530
3531/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3533 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3534 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3535}
3536
3538 // Matches AArch64DAGToDAGISel::SelectArithImmed().
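// For example, 0x123 (a plain 12-bit immediate) and 0x123000 (a 12-bit
// immediate shifted left by 12) are legal, while 0x1001 is not.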
3539 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3540 LLVM_DEBUG(dbgs() << "Is imm " << C
3541 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3542 return IsLegal;
3543}
3544
3546 // Works for negative immediates too, as it can be written as an ADDS
3547 // instruction with a negated immediate.
3548 return isLegalArithImmed(C.abs().getZExtValue());
3549}
3550
3552 uint64_t Imm = C.getZExtValue();
3554 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3555 return Insn.size();
3556}
3557
3559 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3560 if (Op->getFlags().hasNoSignedWrap())
3561 return true;
3562
3563 // We can still figure out if the second operand is safe to use
3564 // in a CMN instruction by checking if it is known to be not the minimum
3565 // signed value. If it is not, then we can safely use CMN.
3566 // Note: We can eventually remove this check and simply rely on
3567 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3568 // consistently sets them appropriately when making said nodes.
3569
3570 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3571 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3572}
3573
3574// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3575// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3576// can be set differently by this operation. It comes down to whether
3577// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3578// everything is fine. If not then the optimization is wrong. Thus general
3579// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3580//
3581// So, finally, the only LLVM-native comparisons that don't mention C or V
3582// are the ones that aren't unsigned comparisons. They're the only ones we can
3583// safely use CMN for in the absence of information about op2.
3585 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3586 (isIntEqualitySetCC(CC) ||
3587 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3588 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3589}
3590
3592 SelectionDAG &DAG, SDValue Chain,
3593 bool IsSignaling) {
3594 EVT VT = LHS.getValueType();
3595 assert(VT != MVT::f128);
3596
3597 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3598
3599 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3600 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3601 {Chain, LHS});
3602 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3603 {LHS.getValue(1), RHS});
3604 Chain = RHS.getValue(1);
3605 }
3606 unsigned Opcode =
3607 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3608 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3609}
3610
3612 const SDLoc &DL, SelectionDAG &DAG) {
3613 EVT VT = LHS.getValueType();
3614 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3615
3616 if (VT.isFloatingPoint()) {
3617 assert(VT != MVT::f128);
3618 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3619 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3620 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3621 }
3622 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3623 }
3624
3625 // The CMP instruction is just an alias for SUBS, and representing it as
3626 // SUBS means that it's possible to get CSE with subtract operations.
3627 // A later phase can perform the optimization of setting the destination
3628 // register to WZR/XZR if it ends up being unused.
3629 unsigned Opcode = AArch64ISD::SUBS;
3630
3631 if (isCMN(RHS, CC, DAG)) {
3632 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
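// For example, (cmp x0, (sub 0, x1)) becomes "cmn x0, x1", an alias of
// "adds xzr, x0, x1".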
3633 Opcode = AArch64ISD::ADDS;
3634 RHS = RHS.getOperand(1);
3635 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3636 isIntEqualitySetCC(CC)) {
3637 // As we are looking for EQ/NE compares, the operands can be commuted; can
3638 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3639 Opcode = AArch64ISD::ADDS;
3640 LHS = LHS.getOperand(1);
3641 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3642 if (LHS.getOpcode() == ISD::AND) {
3643 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3644 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3645 // of the signed comparisons.
3646 const SDValue ANDSNode =
3647 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3648 LHS.getOperand(0), LHS.getOperand(1));
3649 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3650 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3651 return ANDSNode.getValue(1);
3652 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3653 // Use result of ANDS
3654 return LHS.getValue(1);
3655 }
3656 }
3657
3658 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3659 .getValue(1);
3660}
3661
3662/// \defgroup AArch64CCMP CMP;CCMP matching
3663///
3664/// These functions deal with the formation of CMP;CCMP;... sequences.
3665/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3666/// a comparison. They set the NZCV flags to a predefined value if their
3667/// predicate is false. This makes it possible to express arbitrary conjunctions, for
3668/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3669/// expressed as:
3670/// cmp A
3671/// ccmp B, inv(CB), CA
3672/// check for CB flags
3673///
3674/// This naturally lets us implement chains of AND operations with SETCC
3675/// operands. And we can even implement some other situations by transforming
3676/// them:
3677/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3678/// negating the flags used in a CCMP/FCCMP operations.
3679/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3680/// by negating the flags we test for afterwards. i.e.
3681/// NEG (CMP CCMP CCCMP ...) can be implemented.
3682/// - Note that we can only ever negate all previously processed results.
3683/// What we can not implement by flipping the flags to test is a negation
3684/// of two sub-trees (because the negation affects all sub-trees emitted so
3685/// far, so the 2nd sub-tree we emit would also affect the first).
3686/// With those tools we can implement some OR operations:
3687/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3688/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3689/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3690/// elimination rules from earlier to implement the whole thing as a
3691/// CCMP/FCCMP chain.
3692///
3693/// As complete example:
3694/// or (or (setCA (cmp A)) (setCB (cmp B)))
3695/// (and (setCC (cmp C)) (setCD (cmp D)))"
3696/// can be reassociated to:
3697/// or (and (setCC (cmp C)) setCD (cmp D))
3698/// (or (setCA (cmp A)) (setCB (cmp B)))
3699/// can be transformed to:
3700/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3701/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3702/// which can be implemented as:
3703/// cmp C
3704/// ccmp D, inv(CD), CC
3705/// ccmp A, CA, inv(CD)
3706/// ccmp B, CB, inv(CA)
3707/// check for CB flags
3708///
3709/// A counterexample is "or (and A B) (and C D)" which translates to
3710/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3711/// can implement only one of the inner (not) operations, not both!
3712/// @{
3713
3714/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3716 ISD::CondCode CC, SDValue CCOp,
3718 AArch64CC::CondCode OutCC,
3719 const SDLoc &DL, SelectionDAG &DAG) {
3720 unsigned Opcode = 0;
3721 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3722
3723 if (LHS.getValueType().isFloatingPoint()) {
3724 assert(LHS.getValueType() != MVT::f128);
3725 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3726 LHS.getValueType() == MVT::bf16) {
3727 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3728 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3729 }
3730 Opcode = AArch64ISD::FCCMP;
3731 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3732 APInt Imm = Const->getAPIntValue();
3733 if (Imm.isNegative() && Imm.sgt(-32)) {
3734 Opcode = AArch64ISD::CCMN;
3735 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3736 }
3737 } else if (isCMN(RHS, CC, DAG)) {
3738 Opcode = AArch64ISD::CCMN;
3739 RHS = RHS.getOperand(1);
3740 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3741 isIntEqualitySetCC(CC)) {
3742 // As we are looking for EQ/NE compares, the operands can be commuted; can
3743 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3744 Opcode = AArch64ISD::CCMN;
3745 LHS = LHS.getOperand(1);
3746 }
3747 if (Opcode == 0)
3748 Opcode = AArch64ISD::CCMP;
3749
3750 SDValue Condition = getCondCode(DAG, Predicate);
3752 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3753 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3754 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3755}
3756
3757/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3758/// expressed as a conjunction. See \ref AArch64CCMP.
3759/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3760/// changing the conditions on the SETCC tests.
3761/// (this means we can call emitConjunctionRec() with
3762/// Negate==true on this sub-tree)
3763/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3764/// cannot do the negation naturally. We are required to
3765/// emit the subtree first in this case.
3766/// \param WillNegate Is true if we are called when the result of this
3767/// subexpression must be negated. This happens when the
3768/// outer expression is an OR. We can use this fact to know
3769/// that we have a double negation (or (or ...) ...) that
3770/// can be implemented for free.
3771static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3772 bool &MustBeFirst, bool WillNegate,
3773 unsigned Depth = 0) {
3774 if (!Val.hasOneUse())
3775 return false;
3776 unsigned Opcode = Val->getOpcode();
3777 if (Opcode == ISD::SETCC) {
3778 if (Val->getOperand(0).getValueType() == MVT::f128)
3779 return false;
3780 CanNegate = true;
3781 MustBeFirst = false;
3782 return true;
3783 }
3784 // Protect against exponential runtime and stack overflow.
3785 if (Depth > 6)
3786 return false;
3787 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3788 bool IsOR = Opcode == ISD::OR;
3789 SDValue O0 = Val->getOperand(0);
3790 SDValue O1 = Val->getOperand(1);
3791 bool CanNegateL;
3792 bool MustBeFirstL;
3793 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3794 return false;
3795 bool CanNegateR;
3796 bool MustBeFirstR;
3797 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3798 return false;
3799
3800 if (MustBeFirstL && MustBeFirstR)
3801 return false;
3802
3803 if (IsOR) {
3804 // For an OR expression we need to be able to naturally negate at least
3805 // one side or we cannot do the transformation at all.
3806 if (!CanNegateL && !CanNegateR)
3807 return false;
3808 // If the result of the OR will be negated and we can naturally negate
3809 // the leaves, then this sub-tree as a whole negates naturally.
3810 CanNegate = WillNegate && CanNegateL && CanNegateR;
3811 // If we cannot naturally negate the whole sub-tree, then this must be
3812 // emitted first.
3813 MustBeFirst = !CanNegate;
3814 } else {
3815 assert(Opcode == ISD::AND && "Must be OR or AND");
3816 // We cannot naturally negate an AND operation.
3817 CanNegate = false;
3818 MustBeFirst = MustBeFirstL || MustBeFirstR;
3819 }
3820 return true;
3821 }
3822 return false;
3823}
3824
3825/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3826/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3827/// Tries to transform the given i1 producing node @p Val into a series of
3828/// compare and conditional compare operations. @returns an NZCV flags
3829/// producing node and sets @p OutCC to the flags that should be tested, or
3830/// returns SDValue() if the transformation was not possible.
3831/// \p Negate is true if we want this sub-tree being negated just by changing
3832/// SETCC conditions.
3834 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3836 // We're at a tree leaf, produce a conditional comparison operation.
3837 unsigned Opcode = Val->getOpcode();
3838 if (Opcode == ISD::SETCC) {
3839 SDValue LHS = Val->getOperand(0);
3840 SDValue RHS = Val->getOperand(1);
3841 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3842 bool isInteger = LHS.getValueType().isInteger();
3843 if (Negate)
3844 CC = getSetCCInverse(CC, LHS.getValueType());
3845 SDLoc DL(Val);
3846 // Determine OutCC and handle FP special case.
3847 if (isInteger) {
3848 OutCC = changeIntCCToAArch64CC(CC, RHS);
3849 } else {
3850 assert(LHS.getValueType().isFloatingPoint());
3851 AArch64CC::CondCode ExtraCC;
3852 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3853 // Some floating point conditions can't be tested with a single condition
3854 // code. Construct an additional comparison in this case.
3855 if (ExtraCC != AArch64CC::AL) {
3856 SDValue ExtraCmp;
3857 if (!CCOp.getNode())
3858 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3859 else
3860 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3861 ExtraCC, DL, DAG);
3862 CCOp = ExtraCmp;
3863 Predicate = ExtraCC;
3864 }
3865 }
3866
3867 // Produce a normal comparison if we are first in the chain
3868 if (!CCOp)
3869 return emitComparison(LHS, RHS, CC, DL, DAG);
3870 // Otherwise produce a ccmp.
3871 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3872 DAG);
3873 }
3874 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3875
3876 bool IsOR = Opcode == ISD::OR;
3877
3878 SDValue LHS = Val->getOperand(0);
3879 bool CanNegateL;
3880 bool MustBeFirstL;
3881 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3882 assert(ValidL && "Valid conjunction/disjunction tree");
3883 (void)ValidL;
3884
3885 SDValue RHS = Val->getOperand(1);
3886 bool CanNegateR;
3887 bool MustBeFirstR;
3888 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3889 assert(ValidR && "Valid conjunction/disjunction tree");
3890 (void)ValidR;
3891
3892 // Swap sub-tree that must come first to the right side.
3893 if (MustBeFirstL) {
3894 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3895 std::swap(LHS, RHS);
3896 std::swap(CanNegateL, CanNegateR);
3897 std::swap(MustBeFirstL, MustBeFirstR);
3898 }
3899
3900 bool NegateR;
3901 bool NegateAfterR;
3902 bool NegateL;
3903 bool NegateAfterAll;
3904 if (Opcode == ISD::OR) {
3905 // Swap the sub-tree that we can negate naturally to the left.
3906 if (!CanNegateL) {
3907 assert(CanNegateR && "at least one side must be negatable");
3908 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3909 assert(!Negate);
3910 std::swap(LHS, RHS);
3911 NegateR = false;
3912 NegateAfterR = true;
3913 } else {
3914 // Negate the left sub-tree if possible, otherwise negate the result.
3915 NegateR = CanNegateR;
3916 NegateAfterR = !CanNegateR;
3917 }
3918 NegateL = true;
3919 NegateAfterAll = !Negate;
3920 } else {
3921 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3922 assert(!Negate && "Valid conjunction/disjunction tree");
3923
3924 NegateL = false;
3925 NegateR = false;
3926 NegateAfterR = false;
3927 NegateAfterAll = false;
3928 }
3929
3930 // Emit sub-trees.
3931 AArch64CC::CondCode RHSCC;
3932 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3933 if (NegateAfterR)
3934 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3935 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3936 if (NegateAfterAll)
3937 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3938 return CmpL;
3939}
3940
3941/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3942/// In some cases this is even possible with OR operations in the expression.
3943/// See \ref AArch64CCMP.
3944/// \see emitConjunctionRec().
3946 AArch64CC::CondCode &OutCC) {
3947 bool DummyCanNegate;
3948 bool DummyMustBeFirst;
3949 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3950 return SDValue();
3951
3952 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3953}
3954
3955/// @}
3956
3957/// Returns how profitable it is to fold a comparison's operand's shift and/or
3958/// extension operations.
3960 auto isSupportedExtend = [&](SDValue V) {
3961 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3962 return true;
3963
3964 if (V.getOpcode() == ISD::AND)
3965 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3966 uint64_t Mask = MaskCst->getZExtValue();
3967 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3968 }
3969
3970 return false;
3971 };
3972
3973 if (!Op.hasOneUse())
3974 return 0;
3975
3976 if (isSupportedExtend(Op))
3977 return 1;
3978
3979 unsigned Opc = Op.getOpcode();
3980 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3981 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3982 uint64_t Shift = ShiftCst->getZExtValue();
3983 if (isSupportedExtend(Op.getOperand(0)))
3984 return (Shift <= 4) ? 2 : 1;
3985 EVT VT = Op.getValueType();
3986 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3987 return 1;
3988 }
3989
3990 return 0;
3991}
3992
3993// emitComparison() converts a comparison with one or negative one into a
3994// comparison with 0. Note that this only works for signed comparisons
3995// because of how ANDS works.
3997 // Only works for ANDS and AND.
3998 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
3999 return false;
4000
4001 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4002 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4003 return true;
4004 }
4005
4006 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4007 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4008 return true;
4009 }
4010
4011 return false;
4012}
4013
4015 SDValue &AArch64cc, SelectionDAG &DAG,
4016 const SDLoc &DL) {
4017 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4018 EVT VT = RHS.getValueType();
4019 APInt C = RHSC->getAPIntValue();
4020 // shouldBeAdjustedToZero is a special case to better fold with
4021 // emitComparison().
4022 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4023 // Adjust the constant to zero.
4024 // CC has already been adjusted.
4025 RHS = DAG.getConstant(0, DL, VT);
4026 } else if (!isLegalCmpImmed(C)) {
4027 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4028 // Constant does not fit, try adjusting it by one?
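// For example, (x slt 4097) can become (x sle 4096): 4096 encodes as
// "#1, lsl #12", whereas 4097 is not a legal compare immediate.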
4029 switch (CC) {
4030 default:
4031 break;
4032 case ISD::SETLT:
4033 case ISD::SETGE:
4034 if (!C.isMinSignedValue()) {
4035 APInt CMinusOne = C - 1;
4036 if (isLegalCmpImmed(CMinusOne) ||
4037 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4038 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4039 RHS = DAG.getConstant(CMinusOne, DL, VT);
4040 }
4041 }
4042 break;
4043 case ISD::SETULT:
4044 case ISD::SETUGE: {
4045 // C cannot be 0 here, because 0 is always a legal compare immediate.
4046 assert(!C.isZero() && "C should not be zero here");
4047 APInt CMinusOne = C - 1;
4048 if (isLegalCmpImmed(CMinusOne) ||
4049 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4050 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4051 RHS = DAG.getConstant(CMinusOne, DL, VT);
4052 }
4053 break;
4054 }
4055 case ISD::SETLE:
4056 case ISD::SETGT:
4057 if (!C.isMaxSignedValue()) {
4058 APInt CPlusOne = C + 1;
4059 if (isLegalCmpImmed(CPlusOne) ||
4060 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4061 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4062 RHS = DAG.getConstant(CPlusOne, DL, VT);
4063 }
4064 }
4065 break;
4066 case ISD::SETULE:
4067 case ISD::SETUGT: {
4068 if (!C.isAllOnes()) {
4069 APInt CPlusOne = C + 1;
4070 if (isLegalCmpImmed(CPlusOne) ||
4071 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4072 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4073 RHS = DAG.getConstant(CPlusOne, DL, VT);
4074 }
4075 }
4076 break;
4077 }
4078 }
4079 }
4080 }
4081
4082 // Comparisons are canonicalized so that the RHS operand is simpler than the
4083 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4084 // can fold some shift+extend operations on the RHS operand, so swap the
4085 // operands if that can be done.
4086 //
4087 // For example:
4088 // lsl w13, w11, #1
4089 // cmp w13, w12
4090 // can be turned into:
4091 // cmp w12, w11, lsl #1
4092 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4093 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4094 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4095 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4096 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4097
4098 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4099 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4100 std::swap(LHS, RHS);
4102 }
4103 }
4104
4105 SDValue Cmp;
4106 AArch64CC::CondCode AArch64CC;
4107 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4108 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4109
4110 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4111 // For the i8 operand, the largest immediate is 255, so this can be easily
4112 // encoded in the compare instruction. For the i16 operand, however, the
4113 // largest immediate cannot be encoded in the compare.
4114 // Therefore, use a sign extending load and cmn to avoid materializing the
4115 // -1 constant. For example,
4116 // movz w1, #65535
4117 // ldrh w0, [x0, #0]
4118 // cmp w0, w1
4119 // >
4120 // ldrsh w0, [x0, #0]
4121 // cmn w0, #1
4122 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4123 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4124 // ensure both the LHS and RHS are truly zero extended and to make sure the
4125 // transformation is profitable.
4126 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4127 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4128 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4129 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4130 int16_t ValueofRHS = RHS->getAsZExtVal();
4131 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4132 SDValue SExt =
4133 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4134 DAG.getValueType(MVT::i16));
4135 Cmp = emitComparison(
4136 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4137 DL, DAG);
4138 AArch64CC = changeIntCCToAArch64CC(CC);
4139 }
4140 }
4141
4142 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4143 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4144 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4145 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4146 }
4147 }
4148 }
4149
4150 if (!Cmp) {
4151 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4152 AArch64CC = changeIntCCToAArch64CC(CC, RHS);
4153 }
4154 AArch64cc = getCondCode(DAG, AArch64CC);
4155 return Cmp;
4156}
4157
4158static std::pair<SDValue, SDValue>
4160 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4161 "Unsupported value type");
4162 SDValue Value, Overflow;
4163 SDLoc DL(Op);
4164 SDValue LHS = Op.getOperand(0);
4165 SDValue RHS = Op.getOperand(1);
4166 unsigned Opc = 0;
4167 switch (Op.getOpcode()) {
4168 default:
4169 llvm_unreachable("Unknown overflow instruction!");
4170 case ISD::SADDO:
4171 Opc = AArch64ISD::ADDS;
4172 CC = AArch64CC::VS;
4173 break;
4174 case ISD::UADDO:
4175 Opc = AArch64ISD::ADDS;
4176 CC = AArch64CC::HS;
4177 break;
4178 case ISD::SSUBO:
4179 Opc = AArch64ISD::SUBS;
4180 CC = AArch64CC::VS;
4181 break;
4182 case ISD::USUBO:
4183 Opc = AArch64ISD::SUBS;
4184 CC = AArch64CC::LO;
4185 break;
4186 // Multiply needs a little extra work.
4187 case ISD::SMULO:
4188 case ISD::UMULO: {
4189 CC = AArch64CC::NE;
4190 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4191 if (Op.getValueType() == MVT::i32) {
4192 // Extend to 64-bits, then perform a 64-bit multiply.
4193 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4194 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4195 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4196 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4197 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4198
4199 // Check that the result fits into a 32-bit integer.
4200 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4201 if (IsSigned) {
4202 // cmp xreg, wreg, sxtw
4203 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4204 Overflow =
4205 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4206 } else {
4207 // tst xreg, #0xffffffff00000000
4208 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4209 Overflow =
4210 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4211 }
4212 break;
4213 }
4214 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4215 // For the 64 bit multiply
4216 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4217 if (IsSigned) {
4218 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4219 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4220 DAG.getConstant(63, DL, MVT::i64));
4221 // It is important that LowerBits is last, otherwise the arithmetic
4222 // shift will not be folded into the compare (SUBS).
4223 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4224 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4225 .getValue(1);
4226 } else {
4227 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4228 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4229 Overflow =
4230 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4231 DAG.getConstant(0, DL, MVT::i64),
4232 UpperBits).getValue(1);
4233 }
4234 break;
4235 }
4236 } // switch (...)
4237
4238 if (Opc) {
4239 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4240
4241 // Emit the AArch64 operation with overflow check.
4242 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4243 Overflow = Value.getValue(1);
4244 }
4245 return std::make_pair(Value, Overflow);
4246}
4247
4248SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4249 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4250 !Subtarget->isNeonAvailable()))
4251 return LowerToScalableOp(Op, DAG);
4252
4253 SDValue Sel = Op.getOperand(0);
4254 SDValue Other = Op.getOperand(1);
4255 SDLoc DL(Sel);
4256
4257 // If the operand is an overflow checking operation, invert the condition
4258 // code and kill the Not operation. I.e., transform:
4259 // (xor overflow_op_bool, 1)
4260 // -->
4261 // (csel 1, 0, invert(cc), overflow_op_bool)
4262 // ... which later gets transformed to just a cset instruction with an
4263 // inverted condition code, rather than a cset + eor sequence.
4264 if (isOverflowIntrOpRes(Sel)) {
4265 // Only lower legal XALUO ops.
4266 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4267 return SDValue();
4268
4269 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4270 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4271 AArch64CC::CondCode CC;
4272 SDValue Value, Overflow;
4273 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4274 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4275 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4276 CCVal, Overflow);
4277 }
4278 // If neither operand is a SELECT_CC, give up.
4279 if (Sel.getOpcode() != ISD::SELECT_CC)
4280 std::swap(Sel, Other);
4281 if (Sel.getOpcode() != ISD::SELECT_CC)
4282 return Op;
4283
4284 // The folding we want to perform is:
4285 // (xor x, (select_cc a, b, cc, 0, -1) )
4286 // -->
4287 // (csel x, (xor x, -1), cc ...)
4288 //
4289 // The latter will get matched to a CSINV instruction.
4290
4291 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4292 SDValue LHS = Sel.getOperand(0);
4293 SDValue RHS = Sel.getOperand(1);
4294 SDValue TVal = Sel.getOperand(2);
4295 SDValue FVal = Sel.getOperand(3);
4296
4297 // FIXME: This could be generalized to non-integer comparisons.
4298 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4299 return Op;
4300
4301 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4302 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4303
4304 // The values aren't constants, this isn't the pattern we're looking for.
4305 if (!CFVal || !CTVal)
4306 return Op;
4307
4308 // We can commute the SELECT_CC by inverting the condition. This
4309 // might be needed to make this fit into a CSINV pattern.
4310 if (CTVal->isAllOnes() && CFVal->isZero()) {
4311 std::swap(TVal, FVal);
4312 std::swap(CTVal, CFVal);
4313 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4314 }
4315
4316 // If the constants line up, perform the transform!
4317 if (CTVal->isZero() && CFVal->isAllOnes()) {
4318 SDValue CCVal;
4319 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4320
4321 FVal = Other;
4322 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4323 DAG.getAllOnesConstant(DL, Other.getValueType()));
4324
4325 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4326 CCVal, Cmp);
4327 }
4328
4329 return Op;
4330}
4331
4332// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4333// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4334// sets 'C' bit to 0.
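// For example (illustrative): with Invert == false and Value == 5 this builds
// SUBS(5, 1), which sets C (5 >= 1 unsigned); with Value == 0 it builds
// SUBS(0, 1), which borrows and therefore clears C.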
4335 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4336 SDLoc DL(Value);
4337 EVT VT = Value.getValueType();
4338 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4339 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4340 SDValue Cmp =
4341 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4342 return Cmp.getValue(1);
4343}
4344
4345// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4346// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4347 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4348 bool Invert) {
4349 assert(Glue.getResNo() == 1);
4350 SDLoc DL(Glue);
4351 SDValue Zero = DAG.getConstant(0, DL, VT);
4352 SDValue One = DAG.getConstant(1, DL, VT);
4353 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4354 SDValue CC = getCondCode(DAG, Cond);
4355 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4356}
4357
4358// Value is 1 if 'V' bit of NZCV is 1, else 0
4359 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4360 assert(Glue.getResNo() == 1);
4361 SDLoc DL(Glue);
4362 SDValue Zero = DAG.getConstant(0, DL, VT);
4363 SDValue One = DAG.getConstant(1, DL, VT);
4364 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4365 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4366}
4367
4368// This lowering is inefficient, but it will get cleaned up by
4369// `foldOverflowCheck`
4370 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4371 unsigned Opcode, bool IsSigned) {
4372 EVT VT0 = Op.getValue(0).getValueType();
4373 EVT VT1 = Op.getValue(1).getValueType();
4374
4375 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4376 return SDValue();
4377
4378 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4379 SDValue OpLHS = Op.getOperand(0);
4380 SDValue OpRHS = Op.getOperand(1);
4381 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4382
4383 SDLoc DL(Op);
4384
4385 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4386 OpRHS, OpCarryIn);
4387
4388 SDValue OutFlag =
4389 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4390 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4391
4392 return DAG.getMergeValues({Sum, OutFlag}, DL);
4393}
4394
4395 SDValue AArch64TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
4396 // Let legalize expand this if it isn't a legal type yet.
4397 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4398 return SDValue();
4399
4400 SDLoc DL(Op);
4401 AArch64CC::CondCode CC;
4402 // The actual operation that sets the overflow or carry flag.
4403 SDValue Value, Overflow;
4404 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4405
4406 // We use 0 and 1 as false and true values.
4407 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4408 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4409
4410 // We use an inverted condition, because the conditional select is inverted
4411 // too. This will allow it to be selected to a single instruction:
4412 // CSINC Wd, WZR, WZR, invert(cond).
4413 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4414 Overflow =
4415 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4416
4417 return DAG.getMergeValues({Value, Overflow}, DL);
4418}
4419
4420// Prefetch operands are:
4421// 1: Address to prefetch
4422// 2: bool isWrite
4423// 3: int locality (0 = no locality ... 3 = extreme locality)
4424// 4: bool isDataCache
4425 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4426 SDLoc DL(Op);
4427 unsigned IsWrite = Op.getConstantOperandVal(2);
4428 unsigned Locality = Op.getConstantOperandVal(3);
4429 unsigned IsData = Op.getConstantOperandVal(4);
4430
4431 bool IsStream = !Locality;
4432 // When the locality number is set
4433 if (Locality) {
4434 // The front-end should have filtered out the out-of-range values
4435 assert(Locality <= 3 && "Prefetch locality out-of-range");
4436 // The locality degree is the opposite of the cache speed.
4437 // Put the number the other way around.
4438 // The encoding starts at 0 for level 1
4439 Locality = 3 - Locality;
4440 }
4441
4442 // Build the mask value encoding the expected behavior.
4443 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4444 (!IsData << 3) | // IsDataCache bit
4445 (Locality << 1) | // Cache level bits
4446 (unsigned)IsStream; // Stream bit
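// Worked example (illustrative): a read prefetch of data with locality 3
// ("keep in all cache levels") has IsWrite = 0, IsData = 1 and Locality
// remapped to 0, so PrfOp = 0b00000, i.e. the PLDL1KEEP hint.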
4447 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4448 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4449 Op.getOperand(1));
4450}
4451
4452 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z is
4453 // a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4454 // (AND X Y) Z, which gives emitComparison a better optimization opportunity.
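// Worked example (illustrative): "(and X, 0xff) ult 16" becomes
// "(and X, 0xf0) eq 0", since 0xff & ~(16 - 1) == 0xf0.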
4455 static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4456 SelectionDAG &DAG, const SDLoc DL) {
4457 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4458 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4459 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4460 if (LHSConstOp && RHSConst) {
4461 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4462 uint64_t RHSConstant = RHSConst->getZExtValue();
4463 if (isPowerOf2_64(RHSConstant)) {
4464 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4465 LHS =
4466 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4467 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4468 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4469 CC = ISD::SETEQ;
4470 }
4471 }
4472 }
4473}
4474
4475SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4476 SelectionDAG &DAG) const {
4477 EVT VT = Op.getValueType();
4478 if (VT.isScalableVector()) {
4479 SDValue SrcVal = Op.getOperand(0);
4480
4481 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4482 // Break conversion in two with the first part converting to f32 and the
4483 // second using native f32->VT instructions.
4484 SDLoc DL(Op);
4485 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4486 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4487 }
4488
4489 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4490 }
4491
4492 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4493 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4494
4495 bool IsStrict = Op->isStrictFPOpcode();
4496 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4497 EVT Op0VT = Op0.getValueType();
4498 if (VT == MVT::f64) {
4499 // f32->f64 and f16->f64 extends are legal.
4500 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4501 return Op;
4502 // Split bf16->f64 extends into two fpextends.
4503 if (Op0VT == MVT::bf16 && IsStrict) {
4504 SDValue Ext1 =
4505 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4506 {Op0, Op.getOperand(0)});
4507 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4508 {Ext1, Ext1.getValue(1)});
4509 }
4510 if (Op0VT == MVT::bf16)
4511 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4512 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4513 return SDValue();
4514 }
4515
4516 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4517 return SDValue();
4518}
4519
4520SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4521 SelectionDAG &DAG) const {
4522 EVT VT = Op.getValueType();
4523 bool IsStrict = Op->isStrictFPOpcode();
4524 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4525 EVT SrcVT = SrcVal.getValueType();
4526 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4527
4528 if (VT.isScalableVector()) {
4529 // Let common code split the operation.
4530 if (SrcVT == MVT::nxv8f32)
4531 return Op;
4532
4533 if (VT.getScalarType() != MVT::bf16)
4534 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4535
4536 SDLoc DL(Op);
4537 constexpr EVT I32 = MVT::nxv4i32;
4538 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4539
4540 SDValue NaN;
4541 SDValue Narrow;
4542
4543 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4544 if (Subtarget->hasBF16())
4545 return LowerToPredicatedOp(Op, DAG,
4546 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4547
4548 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4549
4550 // Set the quiet bit.
4551 if (!DAG.isKnownNeverSNaN(SrcVal))
4552 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4553 } else if (SrcVT == MVT::nxv2f64 &&
4554 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4555 // Round to float without introducing rounding errors and try again.
4556 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4557 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4558 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4559
4560 SmallVector<SDValue, 3> NewOps;
4561 if (IsStrict)
4562 NewOps.push_back(Op.getOperand(0));
4563 NewOps.push_back(Narrow);
4564 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4565 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4566 } else
4567 return SDValue();
4568
4569 if (!Trunc) {
4570 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4571 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4572 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4573 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4574 }
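// Illustration of the rounding bias (assuming round-to-nearest-even): for an
// f32 with bit pattern 0x40490FDB (~3.14159274), Lsb is 1, so the bias is
// 0x7fff + 1 = 0x8000; adding it and shifting right by 16 below yields the
// bf16 pattern 0x4049 (3.140625), the correctly rounded result.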
4575
4576 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4577 // 0x80000000.
4578 if (NaN) {
4579 EVT I1 = I32.changeElementType(MVT::i1);
4580 EVT CondVT = VT.changeElementType(MVT::i1);
4581 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4582 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4583 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4584 }
4585
4586 // Now that we have rounded, shift the bits into position.
4587 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4588 return getSVESafeBitCast(VT, Narrow, DAG);
4589 }
4590
4591 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4592 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4593
4594 // Expand cases where the result type is BF16 but we don't have hardware
4595 // instructions to lower it.
4596 if (VT.getScalarType() == MVT::bf16 &&
4597 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4598 Subtarget->hasBF16())) {
4599 SDLoc DL(Op);
4600 SDValue Narrow = SrcVal;
4601 SDValue NaN;
4602 EVT I32 = SrcVT.changeElementType(MVT::i32);
4603 EVT F32 = SrcVT.changeElementType(MVT::f32);
4604 if (SrcVT.getScalarType() == MVT::f32) {
4605 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4606 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4607 if (!NeverSNaN) {
4608 // Set the quiet bit.
4609 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4610 DAG.getConstant(0x400000, DL, I32));
4611 }
4612 } else if (SrcVT.getScalarType() == MVT::f64) {
4613 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4614 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4615 } else {
4616 return SDValue();
4617 }
4618 if (!Trunc) {
4619 SDValue One = DAG.getConstant(1, DL, I32);
4620 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4621 DAG.getShiftAmountConstant(16, I32, DL));
4622 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4623 SDValue RoundingBias =
4624 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4625 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4626 }
4627
4628 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4629 // 0x80000000.
4630 if (NaN) {
4631 SDValue IsNaN = DAG.getSetCC(
4632 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4633 SrcVal, SrcVal, ISD::SETUO);
4634 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4635 }
4636
4637 // Now that we have rounded, shift the bits into position.
4638 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4639 DAG.getShiftAmountConstant(16, I32, DL));
4640 if (VT.isVector()) {
4641 EVT I16 = I32.changeVectorElementType(MVT::i16);
4642 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4643 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4644 }
4645 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4646 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4647 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4648 : Result;
4649 }
4650
4651 if (SrcVT != MVT::f128) {
4652 // Expand cases where the input is a vector bigger than NEON.
4653 if (useSVEForFixedLengthVectorVT(SrcVT))
4654 return SDValue();
4655
4656 // It's legal except when f128 is involved
4657 return Op;
4658 }
4659
4660 return SDValue();
4661}
4662
4663SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4664 SelectionDAG &DAG) const {
4665 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4666 // Any additional optimization in this function should be recorded
4667 // in the cost tables.
4668 bool IsStrict = Op->isStrictFPOpcode();
4669 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4670 EVT VT = Op.getValueType();
4671
4672 assert(!(IsStrict && VT.isScalableVector()) &&
4673 "Unimplemented SVE support for STRICT_FP_to_INT!");
4674
4675 // f16 conversions are promoted to f32 when full fp16 is not supported.
4676 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4677 InVT.getVectorElementType() == MVT::bf16) {
4678 EVT NewVT = VT.changeElementType(MVT::f32);
4679 SDLoc DL(Op);
4680 if (IsStrict) {
4681 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4682 {Op.getOperand(0), Op.getOperand(1)});
4683 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4684 {Ext.getValue(1), Ext.getValue(0)});
4685 }
4686 return DAG.getNode(
4687 Op.getOpcode(), DL, Op.getValueType(),
4688 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4689 }
4690
4691 if (VT.isScalableVector()) {
4692 if (VT.getVectorElementType() == MVT::i1) {
4693 SDLoc DL(Op);
4694 EVT CvtVT = getPromotedVTForPredicate(VT);
4695 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4696 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4697 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4698 }
4699
4700 // Let common code split the operation.
4701 if (InVT == MVT::nxv8f32)
4702 return Op;
4703
4704 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4705 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4706 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4707 return LowerToPredicatedOp(Op, DAG, Opcode);
4708 }
4709
4710 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4711 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4712 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4713
4714 uint64_t VTSize = VT.getFixedSizeInBits();
4715 uint64_t InVTSize = InVT.getFixedSizeInBits();
4716 if (VTSize < InVTSize) {
4717 SDLoc DL(Op);
4718 if (IsStrict) {
4719 InVT = InVT.changeVectorElementTypeToInteger();
4720 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4721 {Op.getOperand(0), Op.getOperand(1)});
4722 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4723 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4724 }
4725 SDValue Cv =
4726 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4727 Op.getOperand(0));
4728 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4729 }
4730
4731 if (VTSize > InVTSize) {
4732 SDLoc DL(Op);
4733 MVT ExtVT =
4734 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4735 VT.getVectorNumElements());
4736 if (IsStrict) {
4737 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4738 {Op.getOperand(0), Op.getOperand(1)});
4739 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4740 {Ext.getValue(1), Ext.getValue(0)});
4741 }
4742 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4743 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4744 }
4745
4746 // Use a scalar operation for conversions between single-element vectors of
4747 // the same size.
4748 if (InVT.getVectorNumElements() == 1) {
4749 SDLoc DL(Op);
4750 SDValue Extract = DAG.getNode(
4751 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getVectorElementType(),
4752 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4753 EVT ScalarVT = VT.getScalarType();
4754 if (IsStrict)
4755 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4756 {Op.getOperand(0), Extract});
4757 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4758 }
4759
4760 // Type changing conversions are illegal.
4761 return Op;
4762}
4763
4764SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4765 SelectionDAG &DAG) const {
4766 bool IsStrict = Op->isStrictFPOpcode();
4767 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4768
4769 if (SrcVal.getValueType().isVector())
4770 return LowerVectorFP_TO_INT(Op, DAG);
4771
4772 // f16 conversions are promoted to f32 when full fp16 is not supported.
4773 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4774 SrcVal.getValueType() == MVT::bf16) {
4775 SDLoc DL(Op);
4776 if (IsStrict) {
4777 SDValue Ext =
4778 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4779 {Op.getOperand(0), SrcVal});
4780 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4781 {Ext.getValue(1), Ext.getValue(0)});
4782 }
4783 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4784 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4785 }
4786
4787 if (SrcVal.getValueType() != MVT::f128) {
4788 // It's legal except when f128 is involved
4789 return Op;
4790 }
4791
4792 return SDValue();
4793}
4794
4795SDValue
4796AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4797 SelectionDAG &DAG) const {
4798 // AArch64 FP-to-int conversions saturate to the destination element size, so
4799 // we can lower common saturating conversions to simple instructions.
4800 SDValue SrcVal = Op.getOperand(0);
4801 EVT SrcVT = SrcVal.getValueType();
4802 EVT DstVT = Op.getValueType();
4803 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4804
4805 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4806 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4807 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4808 assert(SatWidth <= DstElementWidth &&
4809 "Saturation width cannot exceed result width");
4810
4811 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4812 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4813 // types, so this is hard to reach.
4814 if (DstVT.isScalableVector())
4815 return SDValue();
4816
4817 EVT SrcElementVT = SrcVT.getVectorElementType();
4818
4819 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4820 SDLoc DL(Op);
4821 SDValue SrcVal2;
4822 if ((SrcElementVT == MVT::f16 &&
4823 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4824 SrcElementVT == MVT::bf16) {
4825 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4826 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4827 // If we are extending to a v8f32, split into two v4f32 to produce legal
4828 // types.
4829 if (F32VT.getSizeInBits() > 128) {
4830 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4831 F32VT = F32VT.getHalfNumVectorElementsVT();
4832 }
4833 SrcVT = F32VT;
4834 SrcElementVT = MVT::f32;
4835 SrcElementWidth = 32;
4836 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4837 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4838 return SDValue();
4839
4840 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4841 // width and produce a fcvtzu.
4842 if (SatWidth == 64 && SrcElementWidth < 64) {
4843 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4844 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4845 SrcVT = F64VT;
4846 SrcElementVT = MVT::f64;
4847 SrcElementWidth = 64;
4848 }
4849 // Cases that we can emit directly.
4850 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4851 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4852 DAG.getValueType(DstVT.getScalarType()));
4853 if (SrcVal2) {
4854 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4855 DAG.getValueType(DstVT.getScalarType()));
4856 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4857 }
4858 return Res;
4859 }
4860
4861 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4862 // result. This is only valid if the legal cvt is larger than the saturate
4863 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4864 // (at least until sqxtn is selected).
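// For instance (illustrative): a v4f32 -> v4i16 fptosi.sat is emitted below as
// a v4i32 saturating conversion, clamped with SMIN/SMAX to [-32768, 32767],
// and finally truncated to v4i16.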
4865 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4866 return SDValue();
4867
4868 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4869 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4870 DAG.getValueType(IntVT.getScalarType()));
4871 SDValue NativeCvt2 =
4872 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4873 DAG.getValueType(IntVT.getScalarType()))
4874 : SDValue();
4875 SDValue Sat, Sat2;
4876 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4877 SDValue MinC = DAG.getConstant(
4878 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4879 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4880 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4881 SDValue MaxC = DAG.getConstant(
4882 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4883 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4884 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4885 } else {
4886 SDValue MinC = DAG.getConstant(
4887 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4888 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4889 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4890 }
4891
4892 if (SrcVal2)
4893 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4894 IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4895 Sat, Sat2);
4896
4897 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4898}
4899
4900SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4901 SelectionDAG &DAG) const {
4902 // AArch64 FP-to-int conversions saturate to the destination register size, so
4903 // we can lower common saturating conversions to simple instructions.
4904 SDValue SrcVal = Op.getOperand(0);
4905 EVT SrcVT = SrcVal.getValueType();
4906
4907 if (SrcVT.isVector())
4908 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4909
4910 EVT DstVT = Op.getValueType();
4911 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4912 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4913 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4914 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4915
4916 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4917 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4918 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4919 SrcVT = MVT::f32;
4920 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4921 SrcVT != MVT::bf16)
4922 return SDValue();
4923
4924 SDLoc DL(Op);
4925 // Cases that we can emit directly.
4926 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4927 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4928 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4929 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4930 DAG.getValueType(DstVT));
4931
4932 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4933 // result. This is only valid if the legal cvt is larger than the saturate
4934 // width.
4935 if (DstWidth < SatWidth)
4936 return SDValue();
4937
4938 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
4939 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4940 SDValue CVTf32 =
4941 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
4942 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
4943 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
4944 DAG.getValueType(SatVT));
4945 }
4946 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
4947 return DAG.getBitcast(DstVT, CVTf32);
4948 }
4949
4950 SDValue NativeCvt =
4951 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4952 SDValue Sat;
4953 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4954 SDValue MinC = DAG.getConstant(
4955 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4956 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4957 SDValue MaxC = DAG.getConstant(
4958 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4959 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4960 } else {
4961 SDValue MinC = DAG.getConstant(
4962 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4963 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4964 }
4965
4966 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4967}
4968
4969SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4970 SelectionDAG &DAG) const {
4971 EVT VT = Op.getValueType();
4972 SDValue Src = Op.getOperand(0);
4973 SDLoc DL(Op);
4974
4975 assert(VT.isVector() && "Expected vector type");
4976
4977 EVT CastVT =
4978 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4979
4980 // Round the floating-point value into a floating-point register with the
4981 // current rounding mode.
4982 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4983
4984 // Truncate the rounded floating point to an integer.
4985 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4986 DAG.getValueType(VT.getVectorElementType()));
4987}
4988
4989SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4990 SelectionDAG &DAG) const {
4991 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4992 // Any additional optimization in this function should be recorded
4993 // in the cost tables.
4994 bool IsStrict = Op->isStrictFPOpcode();
4995 EVT VT = Op.getValueType();
4996 SDLoc DL(Op);
4997 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4998 EVT InVT = In.getValueType();
4999 unsigned Opc = Op.getOpcode();
5000 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5001
5002 assert(!(IsStrict && VT.isScalableVector()) &&
5003 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5004
5005 // NOTE: i1->bf16 does not require promotion to f32.
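// For an i1 input a true lane is -1 when treated as signed and 1 when treated
// as unsigned, so the conversion below reduces to a vector select between the
// corresponding floating-point constants.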
5006 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5007 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5008 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5009 : DAG.getConstantFP(1.0, DL, VT);
5010 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5011 }
5012
5013 // Promote bf16 conversions to f32.
5014 if (VT.getVectorElementType() == MVT::bf16) {
5015 EVT F32 = VT.changeElementType(MVT::f32);
5016 if (IsStrict) {
5017 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5018 {Op.getOperand(0), In});
5019 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5020 {Op.getValueType(), MVT::Other},
5021 {Val.getValue(1), Val.getValue(0),
5022 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5023 }
5024 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5025 DAG.getNode(Op.getOpcode(), DL, F32, In),
5026 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5027 }
5028
5029 if (VT.isScalableVector()) {
5030 // Let common code split the operation.
5031 if (VT == MVT::nxv8f32)
5032 return Op;
5033
5034 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5035 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5036 return LowerToPredicatedOp(Op, DAG, Opcode);
5037 }
5038
5039 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5040 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5041 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5042
5043 uint64_t VTSize = VT.getFixedSizeInBits();
5044 uint64_t InVTSize = InVT.getFixedSizeInBits();
5045 if (VTSize < InVTSize) {
5046 // AArch64 doesn't have a direct vector instruction to convert
5047 // fixed point to floating point AND narrow it at the same time.
5048 // Additional rounding when the target is f32/f64 causes double
5049 // rounding issues. Conversion to f16 is fine due to narrow width.
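// Illustration (assuming round-to-nearest-even): for the i64 value
// (1 << 60) + (1 << 36) + 1, converting i64 -> f64 -> f32 first drops the
// trailing 1 and then ties to even, giving 2^60, whereas a single rounding
// straight to f32 gives 2^60 + 2^37.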
5050 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5051 bool IsTargetf16 = false;
5052 if (Op.hasOneUse() &&
5053 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5054 // Some vector types are split during legalization into half, followed by
5055 // concatenation, followed by rounding to the original vector type. If we
5056 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5057 SDNode *U = *Op->user_begin();
5058 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5059 EVT TmpVT = U->user_begin()->getValueType(0);
5060 if (TmpVT.getScalarType() == MVT::f16)
5061 IsTargetf16 = true;
5062 }
5063 }
5064
5065 if (IsTargetf32 && !IsTargetf16) {
5066 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5067 }
5068
5069 MVT CastVT =
5070 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5071 InVT.getVectorNumElements());
5072 if (IsStrict) {
5073 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5074 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5075 {In.getValue(1), In.getValue(0),
5076 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5077 }
5078 In = DAG.getNode(Opc, DL, CastVT, In);
5079 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5080 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5081 }
5082
5083 if (VTSize > InVTSize) {
5084 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5085 EVT CastVT = VT.changeVectorElementTypeToInteger();
5086 In = DAG.getNode(CastOpc, DL, CastVT, In);
5087 if (IsStrict)
5088 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5089 return DAG.getNode(Opc, DL, VT, In);
5090 }
5091
5092 // Use a scalar operation for conversions between single-element vectors of
5093 // the same size.
5094 if (VT.getVectorNumElements() == 1) {
5095 SDValue Extract =
5096 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5097 DAG.getConstant(0, DL, MVT::i64));
5098 EVT ScalarVT = VT.getScalarType();
5099 if (IsStrict)
5100 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5101 {Op.getOperand(0), Extract});
5102 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5103 }
5104
5105 return Op;
5106}
5107
5108SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5109 SelectionDAG &DAG) const {
5110 if (Op.getValueType().isVector())
5111 return LowerVectorINT_TO_FP(Op, DAG);
5112
5113 bool IsStrict = Op->isStrictFPOpcode();
5114 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5115
5116 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5117 Op->getOpcode() == ISD::SINT_TO_FP;
5118
5119 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5120 SDLoc DL(Op);
5121 if (IsStrict) {
5122 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5123 {Op.getOperand(0), SrcVal});
5124 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5125 {Op.getValueType(), MVT::Other},
5126 {Val.getValue(1), Val.getValue(0),
5127 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5128 }
5129 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5130 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5131 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5132 };
5133
5134 if (Op.getValueType() == MVT::bf16) {
5135 unsigned MaxWidth = IsSigned
5136 ? DAG.ComputeMaxSignificantBits(SrcVal)
5137 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5138 // bf16 conversions are promoted to f32 when converting from i16.
5139 if (MaxWidth <= 24) {
5140 return IntToFpViaPromotion(MVT::f32);
5141 }
5142
5143 // bf16 conversions are promoted to f64 when converting from i32.
5144 if (MaxWidth <= 53) {
5145 return IntToFpViaPromotion(MVT::f64);
5146 }
5147
5148 // We need to be careful about i64 -> bf16.
5149 // Consider an i32 22216703.
5150 // This number cannot be represented exactly as an f32, so an itofp will
5151 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
5152 // However, the correct bf16 result was supposed to be 22151168.0.
5153 // We need to use sticky rounding to get this correct.
5154 if (SrcVal.getValueType() == MVT::i64) {
5155 SDLoc DL(Op);
5156 // This algorithm is equivalent to the following:
5157 // uint64_t SrcHi = SrcVal & ~0xfffull;
5158 // uint64_t SrcLo = SrcVal & 0xfffull;
5159 // uint64_t Highest = SrcVal >> 53;
5160 // bool HasHighest = Highest != 0;
5161 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5162 // double Rounded = static_cast<double>(ToRound);
5163 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5164 // uint64_t HasLo = SrcLo != 0;
5165 // bool NeedsAdjustment = HasHighest & HasLo;
5166 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5167 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5168 // return static_cast<__bf16>(Adjusted);
5169 //
5170 // Essentially, what happens is that SrcVal either fits perfectly in a
5171 // double-precision value or it is too big. If it is sufficiently small,
5172 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5173 // ensure that u64 -> double has no rounding error by only using the 52
5174 // MSB of the input. The low order bits will get merged into a sticky bit
5175 // which will avoid issues incurred by double rounding.
5176
5177 // Signed conversion is more or less like so:
5178 // copysign((__bf16)abs(SrcVal), SrcVal)
5179 SDValue SignBit;
5180 if (IsSigned) {
5181 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5182 DAG.getConstant(1ull << 63, DL, MVT::i64));
5183 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5184 }
5185 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5186 DAG.getConstant(~0xfffull, DL, MVT::i64));
5187 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5188 DAG.getConstant(0xfffull, DL, MVT::i64));
5189 SDValue Highest =
5190 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5191 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5192 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5193 SDValue ToRound =
5194 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5195 SDValue Rounded =
5196 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5197 {Op.getOperand(0), ToRound})
5198 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5199
5200 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5201 if (SignBit) {
5202 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5203 }
5204
5205 SDValue HasHighest = DAG.getSetCC(
5206 DL,
5207 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5208 Highest, Zero64, ISD::SETNE);
5209
5210 SDValue HasLo = DAG.getSetCC(
5211 DL,
5212 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5213 SrcLo, Zero64, ISD::SETNE);
5214
5215 SDValue NeedsAdjustment =
5216 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5217 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5218
5219 SDValue AdjustedBits =
5220 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5221 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5222 return IsStrict
5223 ? DAG.getNode(
5224 ISD::STRICT_FP_ROUND, DL,
5225 {Op.getValueType(), MVT::Other},
5226 {Rounded.getValue(1), Adjusted,
5227 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5228 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5229 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5230 }
5231 }
5232
5233 // f16 conversions are promoted to f32 when full fp16 is not supported.
5234 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5235 return IntToFpViaPromotion(MVT::f32);
5236 }
5237
5238 // i128 conversions are libcalls.
5239 if (SrcVal.getValueType() == MVT::i128)
5240 return SDValue();
5241
5242 // Other conversions are legal, unless it's to the completely software-based
5243 // fp128.
5244 if (Op.getValueType() != MVT::f128)
5245 return Op;
5246 return SDValue();
5247}
5248
5249SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5250 SelectionDAG &DAG) const {
5251 // For iOS, we want to call an alternative entry point: __sincos_stret,
5252 // which returns the values in two S / D registers.
5253 SDLoc DL(Op);
5254 SDValue Arg = Op.getOperand(0);
5255 EVT ArgVT = Arg.getValueType();
5256 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5257
5258 ArgListTy Args;
5259 Args.emplace_back(Arg, ArgTy);
5260
5261 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5262 : RTLIB::SINCOS_STRET_F32;
5263 const char *LibcallName = getLibcallName(LC);
5264 SDValue Callee =
5265 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5266
5267 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5268 CallingConv::ID CC = getLibcallCallingConv(LC);
5269 TargetLowering::CallLoweringInfo CLI(DAG);
5270 CLI.setDebugLoc(DL)
5271 .setChain(DAG.getEntryNode())
5272 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5273
5274 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5275 return CallResult.first;
5276}
5277
5278static MVT getSVEContainerType(EVT ContentTy);
5279
5280SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5281 SelectionDAG &DAG) const {
5282 EVT OpVT = Op.getValueType();
5283 EVT ArgVT = Op.getOperand(0).getValueType();
5284
5285 if (useSVEForFixedLengthVectorVT(OpVT))
5286 return LowerFixedLengthBitcastToSVE(Op, DAG);
5287
5288 if (OpVT.isScalableVector()) {
5289 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5290
5291 // Handle type legalisation first.
5292 if (!isTypeLegal(ArgVT)) {
5293 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5294 "Expected int->fp bitcast!");
5295
5296 // Bitcasting between unpacked vector types of different element counts is
5297 // not a NOP because the live elements are laid out differently.
5298 // 01234567
5299 // e.g. nxv2i32 = XX??XX??
5300 // nxv4f16 = X?X?X?X?
5301 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5302 return SDValue();
5303
5304 SDValue ExtResult =
5305 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5306 Op.getOperand(0));
5307 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5308 }
5309
5310 // Bitcasts between legal types with the same element count are legal.
5311 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5312 return Op;
5313
5314 // getSVESafeBitCast does not support casting between unpacked types.
5315 if (!isPackedVectorType(OpVT, DAG))
5316 return SDValue();
5317
5318 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5319 }
5320
5321 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5322 return SDValue();
5323
5324 // Bitcasts between f16 and bf16 are legal.
5325 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5326 return Op;
5327
5328 assert(ArgVT == MVT::i16);
5329 SDLoc DL(Op);
5330
5331 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5332 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5333 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5334}
5335
5336// Returns lane if Op extracts from a two-element vector and lane is constant
5337// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5338static std::optional<uint64_t>
5339 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5340 SDNode *OpNode = Op.getNode();
5341 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5342 return std::nullopt;
5343
5344 EVT VT = OpNode->getOperand(0).getValueType();
5345 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5346 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5347 return std::nullopt;
5348
5349 return C->getZExtValue();
5350}
5351
5352 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5353 bool isSigned) {
5354 EVT VT = N.getValueType();
5355
5356 if (N.getOpcode() != ISD::BUILD_VECTOR)
5357 return false;
5358
5359 for (const SDValue &Elt : N->op_values()) {
5360 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5361 unsigned EltSize = VT.getScalarSizeInBits();
5362 unsigned HalfSize = EltSize / 2;
5363 if (isSigned) {
5364 if (!isIntN(HalfSize, C->getSExtValue()))
5365 return false;
5366 } else {
5367 if (!isUIntN(HalfSize, C->getZExtValue()))
5368 return false;
5369 }
5370 continue;
5371 }
5372 return false;
5373 }
5374
5375 return true;
5376}
5377
5378 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5379 EVT VT = N.getValueType();
5380 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5381 EVT HalfVT = EVT::getVectorVT(
5382 *DAG.getContext(),
5383 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5384 VT.getVectorNumElements());
5385 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5386}
5387
5388 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5389 return N.getOpcode() == ISD::SIGN_EXTEND ||
5390 N.getOpcode() == ISD::ANY_EXTEND ||
5391 isExtendedBUILD_VECTOR(N, DAG, true);
5392}
5393
5394 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5395 return N.getOpcode() == ISD::ZERO_EXTEND ||
5396 N.getOpcode() == ISD::ANY_EXTEND ||
5397 isExtendedBUILD_VECTOR(N, DAG, false);
5398}
5399
5400 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5401 unsigned Opcode = N.getOpcode();
5402 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5403 SDValue N0 = N.getOperand(0);
5404 SDValue N1 = N.getOperand(1);
5405 return N0->hasOneUse() && N1->hasOneUse() &&
5406 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5407 }
5408 return false;
5409}
5410
5411 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5412 unsigned Opcode = N.getOpcode();
5413 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5414 SDValue N0 = N.getOperand(0);
5415 SDValue N1 = N.getOperand(1);
5416 return N0->hasOneUse() && N1->hasOneUse() &&
5417 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5418 }
5419 return false;
5420}
5421
5422SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5423 SelectionDAG &DAG) const {
5424 // The rounding mode is in bits 23:22 of the FPCR.
5425 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5426 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5427 // so that the shift + and get folded into a bitfield extract.
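// For example, if FPCR.RMode is 0b10 (round towards minus infinity), then
// (((FPCR + (1 << 22)) >> 22) & 3) == 3, the FLT_ROUNDS value for downward
// rounding.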
5428 SDLoc DL(Op);
5429
5430 SDValue Chain = Op.getOperand(0);
5431 SDValue FPCR_64 = DAG.getNode(
5432 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5433 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5434 Chain = FPCR_64.getValue(1);
5435 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5436 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5437 DAG.getConstant(1U << 22, DL, MVT::i32));
5438 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5439 DAG.getConstant(22, DL, MVT::i32));
5440 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5441 DAG.getConstant(3, DL, MVT::i32));
5442 return DAG.getMergeValues({AND, Chain}, DL);
5443}
5444
5445SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5446 SelectionDAG &DAG) const {
5447 SDLoc DL(Op);
5448 SDValue Chain = Op->getOperand(0);
5449 SDValue RMValue = Op->getOperand(1);
5450
5451 // The rounding mode is in bits 23:22 of the FPCR.
5452 // The mapping from the llvm.set.rounding argument value to the rounding mode
5453 // field in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement
5454 // this is (((arg - 1) & 3) << 22).
5455 //
5456 // The argument of llvm.set.rounding must be within the range [0, 3], so
5457 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5458 // code that generates llvm.set.rounding to ensure this condition.
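// For example, llvm.set.rounding(0) (round towards zero) computes
// ((0 - 1) & 3) << 22 == 3 << 22, i.e. FPCR.RMode becomes 0b11 (RZ).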
5459
5460 // Calculate new value of FPCR[23:22].
5461 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5462 DAG.getConstant(1, DL, MVT::i32));
5463 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5464 DAG.getConstant(0x3, DL, MVT::i32));
5465 RMValue =
5466 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5467 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5468 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5469
5470 // Get current value of FPCR.
5471 SDValue Ops[] = {
5472 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5473 SDValue FPCR =
5474 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5475 Chain = FPCR.getValue(1);
5476 FPCR = FPCR.getValue(0);
5477
5478 // Put the new rounding mode into FPCR[23:22].
5479 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5480 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5481 DAG.getConstant(RMMask, DL, MVT::i64));
5482 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5483 SDValue Ops2[] = {
5484 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5485 FPCR};
5486 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5487}
5488
5489SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5490 SelectionDAG &DAG) const {
5491 SDLoc DL(Op);
5492 SDValue Chain = Op->getOperand(0);
5493
5494 // Get current value of FPCR.
5495 SDValue Ops[] = {
5496 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5497 SDValue FPCR =
5498 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5499 Chain = FPCR.getValue(1);
5500 FPCR = FPCR.getValue(0);
5501
5502 // Truncate FPCR to 32 bits.
5503 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5504
5505 return DAG.getMergeValues({Result, Chain}, DL);
5506}
5507
5508SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5509 SelectionDAG &DAG) const {
5510 SDLoc DL(Op);
5511 SDValue Chain = Op->getOperand(0);
5512 SDValue Mode = Op->getOperand(1);
5513
5514 // Extend the specified value to 64 bits.
5515 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5516
5517 // Set new value of FPCR.
5518 SDValue Ops2[] = {
5519 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5520 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5521}
5522
5523SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5524 SelectionDAG &DAG) const {
5525 SDLoc DL(Op);
5526 SDValue Chain = Op->getOperand(0);
5527
5528 // Get current value of FPCR.
5529 SDValue Ops[] = {
5530 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5531 SDValue FPCR =
5532 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5533 Chain = FPCR.getValue(1);
5534 FPCR = FPCR.getValue(0);
5535
5536 // Clear bits that are not reserved.
5537 SDValue FPSCRMasked = DAG.getNode(
5538 ISD::AND, DL, MVT::i64, FPCR,
5539 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5540
5541 // Set new value of FPCR.
5542 SDValue Ops2[] = {Chain,
5543 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5544 FPSCRMasked};
5545 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5546}
5547
5548static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5549 SDLoc DL, bool &IsMLA) {
5550 bool IsN0SExt = isSignExtended(N0, DAG);
5551 bool IsN1SExt = isSignExtended(N1, DAG);
5552 if (IsN0SExt && IsN1SExt)
5553 return AArch64ISD::SMULL;
5554
5555 bool IsN0ZExt = isZeroExtended(N0, DAG);
5556 bool IsN1ZExt = isZeroExtended(N1, DAG);
5557
5558 if (IsN0ZExt && IsN1ZExt)
5559 return AArch64ISD::UMULL;
5560
5561 // Select UMULL if we can replace the other operand with an extend.
5562 EVT VT = N0.getValueType();
5563 unsigned EltSize = VT.getScalarSizeInBits();
5564 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5565 if (IsN0ZExt || IsN1ZExt) {
5566 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5567 return AArch64ISD::UMULL;
5568 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5569 DAG.MaskedValueIsZero(N1, Mask)) {
5570 // For v2i64 we look more aggressively at both operands being zero, to avoid
5571 // scalarization.
5572 return AArch64ISD::UMULL;
5573 }
5574
5575 if (IsN0SExt || IsN1SExt) {
5576 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5577 return AArch64ISD::SMULL;
5578 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5579 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5580 return AArch64ISD::SMULL;
5581 }
5582
5583 if (!IsN1SExt && !IsN1ZExt)
5584 return 0;
5585
5586 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5587 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5588 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5589 IsMLA = true;
5590 return AArch64ISD::SMULL;
5591 }
5592 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5593 IsMLA = true;
5594 return AArch64ISD::UMULL;
5595 }
5596 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5597 std::swap(N0, N1);
5598 IsMLA = true;
5599 return AArch64ISD::UMULL;
5600 }
5601 return 0;
5602}
5603
5604SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5605 EVT VT = Op.getValueType();
5606
5607 bool OverrideNEON = !Subtarget->isNeonAvailable();
5608 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5609 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5610
5611 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5612 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5613 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5614 "unexpected type for custom-lowering ISD::MUL");
5615 SDValue N0 = Op.getOperand(0);
5616 SDValue N1 = Op.getOperand(1);
5617 bool isMLA = false;
5618 EVT OVT = VT;
5619 if (VT.is64BitVector()) {
5620 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5621 isNullConstant(N0.getOperand(1)) &&
5622 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5623 isNullConstant(N1.getOperand(1))) {
5624 N0 = N0.getOperand(0);
5625 N1 = N1.getOperand(0);
5626 VT = N0.getValueType();
5627 } else {
5628 if (VT == MVT::v1i64) {
5629 if (Subtarget->hasSVE())
5630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5631 // Fall through to expand this. It is not legal.
5632 return SDValue();
5633 } else
5634 // Other vector multiplications are legal.
5635 return Op;
5636 }
5637 }
5638
5639 SDLoc DL(Op);
5640 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5641
5642 if (!NewOpc) {
5643 if (VT.getVectorElementType() == MVT::i64) {
5644 // If SVE is available then i64 vector multiplications can also be made
5645 // legal.
5646 if (Subtarget->hasSVE())
5647 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5648 // Fall through to expand this. It is not legal.
5649 return SDValue();
5650 } else
5651 // Other vector multiplications are legal.
5652 return Op;
5653 }
5654
5655 // Legalize to a S/UMULL instruction
5656 SDValue Op0;
5657 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5658 if (!isMLA) {
5659 Op0 = skipExtensionForVectorMULL(N0, DAG);
5660 assert(Op0.getValueType().is64BitVector() &&
5661 Op1.getValueType().is64BitVector() &&
5662 "unexpected types for extended operands to VMULL");
5663 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5664 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5665 DAG.getConstant(0, DL, MVT::i64));
5666 }
5667 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5668 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5669 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5670 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5671 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5672 EVT Op1VT = Op1.getValueType();
5673 return DAG.getNode(
5674 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5675 DAG.getNode(N0.getOpcode(), DL, VT,
5676 DAG.getNode(NewOpc, DL, VT,
5677 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5678 DAG.getNode(NewOpc, DL, VT,
5679 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5680 DAG.getConstant(0, DL, MVT::i64));
5681}
5682
5683static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5684 int Pattern) {
5685 if (Pattern == AArch64SVEPredPattern::all)
5686 return DAG.getConstant(1, DL, VT);
5687 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5688 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5689}
5690
5691 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5692 bool IsSigned, bool IsEqual) {
5693 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5694 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5695
5696 if (!N->getValueType(0).isScalableVector() ||
5697 !isa<ConstantSDNode>(N->getOperand(Op1)))
5698 return SDValue();
5699
5700 SDLoc DL(N);
5701 APInt Y = N->getConstantOperandAPInt(Op1);
5702
5703 // When the second operand is the maximum value, comparisons that include
5704 // equality can never fail and thus we can return an all active predicate.
5705 if (IsEqual)
5706 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5707 return DAG.getConstant(1, DL, N->getValueType(0));
5708
5709 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5710 return SDValue();
5711
5712 APInt X = N->getConstantOperandAPInt(Op0);
5713
5714 bool Overflow;
5715 APInt NumActiveElems =
5716 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5717
5718 if (Overflow)
5719 return SDValue();
5720
5721 if (IsEqual) {
5722 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5723 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5724 : NumActiveElems.uadd_ov(One, Overflow);
5725 if (Overflow)
5726 return SDValue();
5727 }
5728
5729 std::optional<unsigned> PredPattern =
5730 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5731 unsigned MinSVEVectorSize = std::max(
5732 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5733 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5734 if (PredPattern != std::nullopt &&
5735 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5736 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5737
5738 return SDValue();
5739}
5740
5741// Returns a safe bitcast between two scalable vector predicates, where
5742// any newly created lanes from a widening bitcast are defined as zero.
5743 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5744 SDLoc DL(Op);
5745 EVT InVT = Op.getValueType();
5746
5747 assert(InVT.getVectorElementType() == MVT::i1 &&
5748 VT.getVectorElementType() == MVT::i1 &&
5749 "Expected a predicate-to-predicate bitcast");
5750 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5751 InVT.isScalableVector() &&
5752 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5753 "Only expect to cast between legal scalable predicate types!");
5754
5755 // Return the operand if the cast isn't changing type.
5756 if (InVT == VT)
5757 return Op;
5758
5759 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5760 // than VT. This will increase the chances of removing casts that introduce
5761 // new lanes, which have to be explicitly zero'd.
5762 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5763 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5764 Op.getOperand(1).getValueType().bitsGT(VT))
5765 Op = Op.getOperand(1);
5766
5767 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5768
5769 // We only have to zero the lanes if new lanes are being defined, e.g. when
5770 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5771 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5772 // we can return here.
5773 if (InVT.bitsGT(VT))
5774 return Reinterpret;
5775
5776 // Check if the other lanes are already known to be zeroed by
5777 // construction.
5778 if (isZeroingInactiveLanes(Op))
5779 return Reinterpret;
5780
5781 // Zero the newly introduced lanes.
5782 SDValue Mask = DAG.getConstant(1, DL, InVT);
5783 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5784 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5785}
5786
5787SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5788 SDValue Chain, SDLoc DL,
5789 EVT VT) const {
5790 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5791 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
5792 getPointerTy(DAG.getDataLayout()));
5793 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5794 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5795 TargetLowering::CallLoweringInfo CLI(DAG);
5796 ArgListTy Args;
5797 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5798 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5799 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5800 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5801 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5802 Mask);
5803}
5804
5805// Lower an SME LDR/STR ZA intrinsic
5806// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5807// folded into the instruction
5808// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5809// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5810// and tile slice registers
5811// ldr(%tileslice, %ptr, %vecnum)
5812// ->
5813// %svl = rdsvl
5814// %ptr2 = %ptr + %svl * %vecnum
5815// %tileslice2 = %tileslice + %vecnum
5816// ldr [%tileslice2, 0], [%ptr2, 0]
5817// Case 3: If the vecnum is an immediate out of range, then the same is done as
5818// case 2, but the base and slice registers are modified by the greatest
5819// multiple of 16 not exceeding the vecnum, and the remainder is folded into
5820// the instruction. This means that successive loads and stores that are offset
5821// from each other can share the same base and slice register updates.
5822// ldr(%tileslice, %ptr, 22)
5823// ldr(%tileslice, %ptr, 23)
5824// ->
5825// %svl = rdsvl
5826// %ptr2 = %ptr + %svl * 16
5827// %tileslice2 = %tileslice + 16
5828// ldr [%tileslice2, 6], [%ptr2, 6]
5829// ldr [%tileslice2, 7], [%ptr2, 7]
5830// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5831// operand and the immediate can be folded into the instruction, like case 2.
5832// ldr(%tileslice, %ptr, %vecnum + 7)
5833// ldr(%tileslice, %ptr, %vecnum + 8)
5834// ->
5835// %svl = rdsvl
5836// %ptr2 = %ptr + %svl * %vecnum
5837// %tileslice2 = %tileslice + %vecnum
5838// ldr [%tileslice2, 7], [%ptr2, 7]
5839// ldr [%tileslice2, 8], [%ptr2, 8]
5840// Case 5: The vecnum being an add of an immediate out of range is also handled,
5841// in which case the same remainder logic as case 3 is used.
5842static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5843 SDLoc DL(N);
5844
5845 SDValue TileSlice = N->getOperand(2);
5846 SDValue Base = N->getOperand(3);
5847 SDValue VecNum = N->getOperand(4);
5848 int32_t ConstAddend = 0;
5849 SDValue VarAddend = VecNum;
5850
5851 // If the vnum is an add of an immediate, we can fold it into the instruction
5852 if (VecNum.getOpcode() == ISD::ADD &&
5853 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5854 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5855 VarAddend = VecNum.getOperand(0);
5856 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5857 ConstAddend = ImmNode->getSExtValue();
5858 VarAddend = SDValue();
5859 }
5860
5861 int32_t ImmAddend = ConstAddend % 16;
5862 if (int32_t C = (ConstAddend - ImmAddend)) {
5863 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5864 VarAddend = VarAddend
5865 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5866 : CVal;
5867 }
5868
5869 if (VarAddend) {
5870 // Get the vector length that will be multiplied by vnum
5871 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5872 DAG.getConstant(1, DL, MVT::i32));
5873
5874 // Multiply SVL and vnum then add it to the base
5875 SDValue Mul = DAG.getNode(
5876 ISD::MUL, DL, MVT::i64,
5877 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5878 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5879 // Just add vnum to the tileslice
5880 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5881 }
5882
5883 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5884 DL, MVT::Other,
5885 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5886 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5887}
5888
5889static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5890 SDLoc DL(Op);
5891 SDValue ID =
5892 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
5893
5894 auto Op1 = Op.getOperand(1);
5895 auto Op2 = Op.getOperand(2);
5896 auto Mask = Op.getOperand(3);
5897
5898 EVT Op1VT = Op1.getValueType();
5899 EVT Op2VT = Op2.getValueType();
5900 EVT ResVT = Op.getValueType();
5901
5902 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5903 Op1VT.getVectorElementType() == MVT::i16) &&
5904 "Expected 8-bit or 16-bit characters.");
5905
5906 // Scalable vector type used to wrap operands.
5907 // A single container is enough for both operands because ultimately the
5908 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5909 EVT OpContainerVT = Op1VT.isScalableVector()
5910 ? Op1VT
5911 : getContainerForFixedLengthVector(DAG, Op1VT);
5912
5913 if (Op2VT.is128BitVector()) {
5914 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5915 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5916 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5917 if (ResVT.isScalableVector())
5918 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
5919 DAG.getTargetConstant(0, DL, MVT::i64));
5920 } else {
5921 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5922 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5923 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5924 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5925 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5926 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
5927 DAG.getConstant(0, DL, MVT::i64));
5928 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
5929 Op2 = DAG.getBitcast(OpContainerVT, Op2);
5930 }
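// Illustrative note: a v8i8 Op2, for example, is viewed as a single i64
// element, splatted across the packed nxv2i64 container and then bitcast to
// the nxv16i8 operand container expected by the MATCH intrinsic.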
5931
5932 // If the result is scalable, we just need to carry out the MATCH.
5933 if (ResVT.isScalableVector())
5934 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
5935
5936 // If the result is fixed, we can still use MATCH but we need to wrap the
5937 // first operand and the mask in scalable vectors before doing so.
5938
5939 // Wrap the operands.
5940 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5941 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
5942 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5943
5944 // Carry out the match.
5945 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
5946 ID, Mask, Op1, Op2);
5947
5948 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5949 // (v16i8/v8i8).
5950 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
5951 Match = convertFromScalableVector(DAG, Op1VT, Match);
5952 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
5953}
5954
5955SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5956 SelectionDAG &DAG) const {
5957 unsigned IntNo = Op.getConstantOperandVal(1);
5958 SDLoc DL(Op);
5959 switch (IntNo) {
5960 default:
5961 return SDValue(); // Don't custom lower most intrinsics.
5962 case Intrinsic::aarch64_prefetch: {
5963 SDValue Chain = Op.getOperand(0);
5964 SDValue Addr = Op.getOperand(2);
5965
5966 unsigned IsWrite = Op.getConstantOperandVal(3);
5967 unsigned Locality = Op.getConstantOperandVal(4);
5968 unsigned IsStream = Op.getConstantOperandVal(5);
5969 unsigned IsData = Op.getConstantOperandVal(6);
5970 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5971 (!IsData << 3) | // IsDataCache bit
5972 (Locality << 1) | // Cache level bits
5973 (unsigned)IsStream; // Stream bit
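// Worked example: a data prefetch for write to L1 with the keep policy
// (IsWrite = 1, IsData = 1, Locality = 0, IsStream = 0) packs to
// PrfOp = 0b10000, i.e. the PSTL1KEEP operand of PRFM.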
5974
5975 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5976 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5977 }
5978 case Intrinsic::aarch64_sme_str:
5979 case Intrinsic::aarch64_sme_ldr: {
5980 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5981 }
5982 case Intrinsic::aarch64_sme_za_enable:
5983 return DAG.getNode(
5984 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
5985 Op->getOperand(0), // Chain
5986 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
5987 case Intrinsic::aarch64_sme_za_disable:
5988 return DAG.getNode(
5989 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
5990 Op->getOperand(0), // Chain
5991 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
5992 }
5993}
5994
5995SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5996 SelectionDAG &DAG) const {
5997 unsigned IntNo = Op.getConstantOperandVal(1);
5998 SDLoc DL(Op);
5999 switch (IntNo) {
6000 default:
6001 return SDValue(); // Don't custom lower most intrinsics.
6002 case Intrinsic::aarch64_mops_memset_tag: {
6003 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6004 SDValue Chain = Node->getChain();
6005 SDValue Dst = Op.getOperand(2);
6006 SDValue Val = Op.getOperand(3);
6007 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6008 SDValue Size = Op.getOperand(4);
6009 auto Alignment = Node->getMemOperand()->getAlign();
6010 bool IsVol = Node->isVolatile();
6011 auto DstPtrInfo = Node->getPointerInfo();
6012
6013 const auto &SDI =
6014 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6015 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6016 Chain, Dst, Val, Size, Alignment, IsVol,
6017 DstPtrInfo, MachinePointerInfo{});
6018
6019 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6020 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6021 // LowerOperationWrapper will complain that the number of results has
6022 // changed.
6023 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6024 }
6025 }
6026}
6027
6028SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6029 SelectionDAG &DAG) const {
6030 unsigned IntNo = Op.getConstantOperandVal(0);
6031 SDLoc DL(Op);
6032 switch (IntNo) {
6033 default: return SDValue(); // Don't custom lower most intrinsics.
6034 case Intrinsic::thread_pointer: {
6035 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6036 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6037 }
6038 case Intrinsic::aarch64_neon_abs: {
6039 EVT Ty = Op.getValueType();
6040 if (Ty == MVT::i64) {
6041 SDValue Result =
6042 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6043 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6044 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6045 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6046 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6047 } else {
6048 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6049 }
6050 }
6051 case Intrinsic::aarch64_neon_pmull64: {
6052 SDValue LHS = Op.getOperand(1);
6053 SDValue RHS = Op.getOperand(2);
6054
6055 std::optional<uint64_t> LHSLane =
6056 getConstantLaneNumOfExtractHalfOperand(LHS);
6057 std::optional<uint64_t> RHSLane =
6058 getConstantLaneNumOfExtractHalfOperand(RHS);
6059
6060 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6061 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6062
6063 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6064 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6065 // which ISel recognizes better. For example, generate a ldr into d*
6066 // registers as opposed to a GPR load followed by a fmov.
6067 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6068 std::optional<uint64_t> OtherLane,
6069 const SDLoc &DL,
6070 SelectionDAG &DAG) -> SDValue {
6071 // If the operand is a higher half itself, rewrite it to
6072 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6073 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6074 if (NLane == 1)
6075 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6076 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6077
6078 // Operand N is not a higher half but the other operand is.
6079 if (OtherLane == 1) {
6080 // If this operand is a lower half, rewrite it to
6081 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6082 // align lanes of two operands. A roundtrip sequence (to move from lane
6083 // 1 to lane 0) is like this:
6084 // mov x8, v0.d[1]
6085 // fmov d0, x8
6086 if (NLane == 0)
6087 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6088 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6089 N.getOperand(0),
6090 DAG.getConstant(0, DL, MVT::i64)),
6091 DAG.getConstant(1, DL, MVT::i64));
6092
6093 // Otherwise just dup from main to all lanes.
6094 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6095 }
6096
6097 // Neither operand is an extract of higher half, so codegen may just use
6098 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6099 assert(N.getValueType() == MVT::i64 &&
6100 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6101 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6102 };
6103
6104 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6105 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6106
6107 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6108 }
6109 case Intrinsic::aarch64_neon_smax:
6110 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6111 Op.getOperand(2));
6112 case Intrinsic::aarch64_neon_umax:
6113 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6114 Op.getOperand(2));
6115 case Intrinsic::aarch64_neon_smin:
6116 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6117 Op.getOperand(2));
6118 case Intrinsic::aarch64_neon_umin:
6119 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6120 Op.getOperand(2));
6121 case Intrinsic::aarch64_neon_scalar_sqxtn:
6122 case Intrinsic::aarch64_neon_scalar_sqxtun:
6123 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6124 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6125 if (Op.getValueType() == MVT::i32)
6126 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6127 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6128 Op.getOperand(0),
6129 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6130 Op.getOperand(1))));
6131 return SDValue();
6132 }
6133 case Intrinsic::aarch64_neon_sqxtn:
6134 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6135 Op.getOperand(1));
6136 case Intrinsic::aarch64_neon_sqxtun:
6137 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6138 Op.getOperand(1));
6139 case Intrinsic::aarch64_neon_uqxtn:
6140 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6141 Op.getOperand(1));
6142 case Intrinsic::aarch64_neon_sqshrn:
6143 if (Op.getValueType().isVector())
6144 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6145 DAG.getNode(AArch64ISD::VASHR, DL,
6146 Op.getOperand(1).getValueType(),
6147 Op.getOperand(1), Op.getOperand(2)));
6148 return SDValue();
6149 case Intrinsic::aarch64_neon_sqshrun:
6150 if (Op.getValueType().isVector())
6151 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6152 DAG.getNode(AArch64ISD::VASHR, DL,
6153 Op.getOperand(1).getValueType(),
6154 Op.getOperand(1), Op.getOperand(2)));
6155 return SDValue();
6156 case Intrinsic::aarch64_neon_uqshrn:
6157 if (Op.getValueType().isVector())
6158 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6159 DAG.getNode(AArch64ISD::VLSHR, DL,
6160 Op.getOperand(1).getValueType(),
6161 Op.getOperand(1), Op.getOperand(2)));
6162 return SDValue();
6163 case Intrinsic::aarch64_neon_sqrshrn:
6164 if (Op.getValueType().isVector())
6165 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6166 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6167 Op.getOperand(1).getValueType(),
6168 Op.getOperand(1), Op.getOperand(2)));
6169 return SDValue();
6170 case Intrinsic::aarch64_neon_sqrshrun:
6171 if (Op.getValueType().isVector())
6172 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6173 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6174 Op.getOperand(1).getValueType(),
6175 Op.getOperand(1), Op.getOperand(2)));
6176 return SDValue();
6177 case Intrinsic::aarch64_neon_uqrshrn:
6178 if (Op.getValueType().isVector())
6179 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6180 DAG.getNode(AArch64ISD::URSHR_I, DL,
6181 Op.getOperand(1).getValueType(),
6182 Op.getOperand(1), Op.getOperand(2)));
6183 return SDValue();
6184 case Intrinsic::aarch64_neon_sqadd:
6185 if (Op.getValueType().isVector())
6186 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6187 Op.getOperand(2));
6188 return SDValue();
6189 case Intrinsic::aarch64_neon_sqsub:
6190 if (Op.getValueType().isVector())
6191 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6192 Op.getOperand(2));
6193 return SDValue();
6194 case Intrinsic::aarch64_neon_uqadd:
6195 if (Op.getValueType().isVector())
6196 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6197 Op.getOperand(2));
6198 return SDValue();
6199 case Intrinsic::aarch64_neon_uqsub:
6200 if (Op.getValueType().isVector())
6201 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6202 Op.getOperand(2));
6203 return SDValue();
6204 case Intrinsic::aarch64_sve_whilelt:
6205 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6206 /*IsEqual=*/false);
6207 case Intrinsic::aarch64_sve_whilels:
6208 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6209 /*IsEqual=*/true);
6210 case Intrinsic::aarch64_sve_whilele:
6211 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6212 /*IsEqual=*/true);
6213 case Intrinsic::aarch64_sve_sunpkhi:
6214 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6215 Op.getOperand(1));
6216 case Intrinsic::aarch64_sve_sunpklo:
6217 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6218 Op.getOperand(1));
6219 case Intrinsic::aarch64_sve_uunpkhi:
6220 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6221 Op.getOperand(1));
6222 case Intrinsic::aarch64_sve_uunpklo:
6223 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6224 Op.getOperand(1));
6225 case Intrinsic::aarch64_sve_clasta_n:
6226 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6227 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6228 case Intrinsic::aarch64_sve_clastb_n:
6229 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6230 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6231 case Intrinsic::aarch64_sve_lasta:
6232 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6233 Op.getOperand(1), Op.getOperand(2));
6234 case Intrinsic::aarch64_sve_lastb:
6235 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6236 Op.getOperand(1), Op.getOperand(2));
6237 case Intrinsic::aarch64_sve_rev:
6238 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6239 Op.getOperand(1));
6240 case Intrinsic::aarch64_sve_tbl:
6241 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6242 Op.getOperand(2));
6243 case Intrinsic::aarch64_sve_trn1:
6244 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6245 Op.getOperand(1), Op.getOperand(2));
6246 case Intrinsic::aarch64_sve_trn2:
6247 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6248 Op.getOperand(1), Op.getOperand(2));
6249 case Intrinsic::aarch64_sve_uzp1:
6250 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6251 Op.getOperand(1), Op.getOperand(2));
6252 case Intrinsic::aarch64_sve_uzp2:
6253 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6254 Op.getOperand(1), Op.getOperand(2));
6255 case Intrinsic::aarch64_sve_zip1:
6256 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6257 Op.getOperand(1), Op.getOperand(2));
6258 case Intrinsic::aarch64_sve_zip2:
6259 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6260 Op.getOperand(1), Op.getOperand(2));
6261 case Intrinsic::aarch64_sve_splice:
6262 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6263 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6264 case Intrinsic::aarch64_sve_ptrue:
6265 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6266 case Intrinsic::aarch64_sve_clz:
6267 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6268 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6269 case Intrinsic::aarch64_sme_cntsb:
6270 return DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6271 DAG.getConstant(1, DL, MVT::i32));
6272 case Intrinsic::aarch64_sme_cntsh: {
6273 SDValue One = DAG.getConstant(1, DL, MVT::i32);
6274 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), One);
6275 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, One);
6276 }
6277 case Intrinsic::aarch64_sme_cntsw: {
6278 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6279 DAG.getConstant(1, DL, MVT::i32));
6280 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6281 DAG.getConstant(2, DL, MVT::i32));
6282 }
6283 case Intrinsic::aarch64_sme_cntsd: {
6284 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6285 DAG.getConstant(1, DL, MVT::i32));
6286 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6287 DAG.getConstant(3, DL, MVT::i32));
6288 }
6289 case Intrinsic::aarch64_sve_cnt: {
6290 SDValue Data = Op.getOperand(3);
6291 // CTPOP only supports integer operands.
6292 if (Data.getValueType().isFloatingPoint())
6293 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6294 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6295 Op.getOperand(2), Data, Op.getOperand(1));
6296 }
6297 case Intrinsic::aarch64_sve_dupq_lane:
6298 return LowerDUPQLane(Op, DAG);
6299 case Intrinsic::aarch64_sve_convert_from_svbool:
6300 if (Op.getValueType() == MVT::aarch64svcount)
6301 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6302 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6303 case Intrinsic::aarch64_sve_convert_to_svbool:
6304 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6305 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6306 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6307 case Intrinsic::aarch64_sve_fneg:
6308 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6309 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6310 case Intrinsic::aarch64_sve_frintp:
6311 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6312 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6313 case Intrinsic::aarch64_sve_frintm:
6314 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6315 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6316 case Intrinsic::aarch64_sve_frinti:
6317 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6318 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6319 Op.getOperand(1));
6320 case Intrinsic::aarch64_sve_frintx:
6321 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6322 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6323 case Intrinsic::aarch64_sve_frinta:
6324 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6325 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6326 case Intrinsic::aarch64_sve_frintn:
6327 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6328 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6329 Op.getOperand(1));
6330 case Intrinsic::aarch64_sve_frintz:
6331 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6332 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6333 case Intrinsic::aarch64_sve_ucvtf:
6334 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6335 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6336 Op.getOperand(1));
6337 case Intrinsic::aarch64_sve_scvtf:
6338 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6339 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6340 Op.getOperand(1));
6341 case Intrinsic::aarch64_sve_fcvtzu:
6342 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6343 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6344 case Intrinsic::aarch64_sve_fcvtzs:
6345 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6346 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6347 case Intrinsic::aarch64_sve_fsqrt:
6348 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6349 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6350 case Intrinsic::aarch64_sve_frecpx:
6351 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6352 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6353 case Intrinsic::aarch64_sve_frecpe_x:
6354 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6355 Op.getOperand(1));
6356 case Intrinsic::aarch64_sve_frecps_x:
6357 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6358 Op.getOperand(1), Op.getOperand(2));
6359 case Intrinsic::aarch64_sve_frsqrte_x:
6360 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6361 Op.getOperand(1));
6362 case Intrinsic::aarch64_sve_frsqrts_x:
6363 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6364 Op.getOperand(1), Op.getOperand(2));
6365 case Intrinsic::aarch64_sve_fabs:
6366 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6367 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6368 case Intrinsic::aarch64_sve_abs:
6369 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6370 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6371 case Intrinsic::aarch64_sve_neg:
6372 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6373 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6374 case Intrinsic::aarch64_sve_insr: {
6375 SDValue Scalar = Op.getOperand(2);
6376 EVT ScalarTy = Scalar.getValueType();
6377 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6378 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6379
6380 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6381 Op.getOperand(1), Scalar);
6382 }
6383 case Intrinsic::aarch64_sve_rbit:
6384 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6385 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6386 Op.getOperand(1));
6387 case Intrinsic::aarch64_sve_revb:
6388 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6389 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6390 case Intrinsic::aarch64_sve_revh:
6391 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6392 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6393 case Intrinsic::aarch64_sve_revw:
6394 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6395 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6396 case Intrinsic::aarch64_sve_revd:
6397 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6398 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6399 case Intrinsic::aarch64_sve_sxtb:
6400 return DAG.getNode(
6401 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6402 Op.getOperand(2), Op.getOperand(3),
6403 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6404 Op.getOperand(1));
6405 case Intrinsic::aarch64_sve_sxth:
6406 return DAG.getNode(
6407 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6408 Op.getOperand(2), Op.getOperand(3),
6409 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6410 Op.getOperand(1));
6411 case Intrinsic::aarch64_sve_sxtw:
6412 return DAG.getNode(
6413 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6414 Op.getOperand(2), Op.getOperand(3),
6415 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6416 Op.getOperand(1));
6417 case Intrinsic::aarch64_sve_uxtb:
6418 return DAG.getNode(
6419 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6420 Op.getOperand(2), Op.getOperand(3),
6421 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6422 Op.getOperand(1));
6423 case Intrinsic::aarch64_sve_uxth:
6424 return DAG.getNode(
6425 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6426 Op.getOperand(2), Op.getOperand(3),
6427 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6428 Op.getOperand(1));
6429 case Intrinsic::aarch64_sve_uxtw:
6430 return DAG.getNode(
6431 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6432 Op.getOperand(2), Op.getOperand(3),
6433 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6434 Op.getOperand(1));
6435 case Intrinsic::localaddress: {
6436 const auto &MF = DAG.getMachineFunction();
6437 const auto *RegInfo = Subtarget->getRegisterInfo();
6438 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6439 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6440 Op.getSimpleValueType());
6441 }
6442
6443 case Intrinsic::eh_recoverfp: {
6444 // FIXME: This needs to be implemented to correctly handle highly aligned
6445 // stack objects. For now we simply return the incoming FP. Refer D53541
6446 // for more details.
6447 SDValue FnOp = Op.getOperand(1);
6448 SDValue IncomingFPOp = Op.getOperand(2);
6449 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6450 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6451 if (!Fn)
6452 report_fatal_error(
6453 "llvm.eh.recoverfp must take a function as the first argument");
6454 return IncomingFPOp;
6455 }
6456
6457 case Intrinsic::aarch64_neon_vsri:
6458 case Intrinsic::aarch64_neon_vsli:
6459 case Intrinsic::aarch64_sve_sri:
6460 case Intrinsic::aarch64_sve_sli: {
6461 EVT Ty = Op.getValueType();
6462
6463 if (!Ty.isVector())
6464 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6465
6466 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6467
6468 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6469 IntNo == Intrinsic::aarch64_sve_sri;
6470 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6471 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6472 Op.getOperand(3));
6473 }
6474
6475 case Intrinsic::aarch64_neon_srhadd:
6476 case Intrinsic::aarch64_neon_urhadd:
6477 case Intrinsic::aarch64_neon_shadd:
6478 case Intrinsic::aarch64_neon_uhadd: {
6479 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6480 IntNo == Intrinsic::aarch64_neon_shadd);
6481 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6482 IntNo == Intrinsic::aarch64_neon_urhadd);
6483 unsigned Opcode = IsSignedAdd
6484 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6485 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6486 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6487 Op.getOperand(2));
6488 }
6489 case Intrinsic::aarch64_neon_saddlp:
6490 case Intrinsic::aarch64_neon_uaddlp: {
6491 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6492 ? AArch64ISD::UADDLP
6493 : AArch64ISD::SADDLP;
6494 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6495 }
6496 case Intrinsic::aarch64_neon_sdot:
6497 case Intrinsic::aarch64_neon_udot:
6498 case Intrinsic::aarch64_sve_sdot:
6499 case Intrinsic::aarch64_sve_udot: {
6500 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6501 IntNo == Intrinsic::aarch64_sve_udot)
6502 ? AArch64ISD::UDOT
6503 : AArch64ISD::SDOT;
6504 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6505 Op.getOperand(2), Op.getOperand(3));
6506 }
6507 case Intrinsic::aarch64_neon_usdot:
6508 case Intrinsic::aarch64_sve_usdot: {
6509 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6510 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6511 }
6512 case Intrinsic::aarch64_neon_saddlv:
6513 case Intrinsic::aarch64_neon_uaddlv: {
6514 EVT OpVT = Op.getOperand(1).getValueType();
6515 EVT ResVT = Op.getValueType();
6516 assert(
6517 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6518 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6519 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6520 "Unexpected aarch64_neon_u/saddlv type");
6521 (void)OpVT;
6522 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6523 SDValue ADDLV = DAG.getNode(
6524 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6525 : AArch64ISD::SADDLV,
6526 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6527 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6528 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6529 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6530 return EXTRACT_VEC_ELT;
6531 }
6532 case Intrinsic::experimental_cttz_elts: {
6533 SDValue CttzOp = Op.getOperand(1);
6534 EVT VT = CttzOp.getValueType();
6535 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6536
6537 if (VT.isFixedLengthVector()) {
6538 // We can use SVE instructions to lower this intrinsic by first creating
6539 // an SVE predicate register mask from the fixed-width vector.
6540 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6541 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6542 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6543 }
6544
6545 SDValue NewCttzElts =
6546 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6547 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6548 }
6549 case Intrinsic::experimental_vector_match: {
6550 return LowerVectorMatch(Op, DAG);
6551 }
6552 }
6553}
6554
6555bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6556 if (VT.getVectorElementType() == MVT::i8 ||
6557 VT.getVectorElementType() == MVT::i16) {
6558 EltTy = MVT::i32;
6559 return true;
6560 }
6561 return false;
6562}
6563
6564bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6565 EVT DataVT) const {
6566 const EVT IndexVT = Extend.getOperand(0).getValueType();
6567 // SVE only supports implicit extension of 32-bit indices.
6568 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6569 return false;
6570
6571 // Indices cannot be smaller than the main data type.
6572 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6573 return false;
6574
6575 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6576 // element container type, which would violate the previous clause.
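 // Illustrative note: an extend of nxv4i32 indices feeding an nxv4i32 gather
 // can be removed, but one feeding an nxv2i32 gather cannot, because the
 // two-element data is legalised into 64-bit element containers.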
6577 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6578}
6579
6580bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6581 EVT ExtVT = ExtVal.getValueType();
6582 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6583 return false;
6584
6585 // It may be worth creating extending masked loads if there are multiple
6586 // masked loads using the same predicate. That way we'll end up creating
6587 // extending masked loads that may then get split by the legaliser. This
6588 // results in just one set of predicate unpacks at the start, instead of
6589 // multiple sets of vector unpacks after each load.
6590 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6591 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6592 // Disable extending masked loads for fixed-width for now, since the code
6593 // quality doesn't look great.
6594 if (!ExtVT.isScalableVector())
6595 return false;
6596
6597 unsigned NumExtMaskedLoads = 0;
6598 for (auto *U : Ld->getMask()->users())
6599 if (isa<MaskedLoadSDNode>(U))
6600 NumExtMaskedLoads++;
6601
6602 if (NumExtMaskedLoads <= 1)
6603 return false;
6604 }
6605 }
6606
6607 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6608 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6609 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6610}
6611
6612unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6613 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6614 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6615 AArch64ISD::GLD1_MERGE_ZERO},
6616 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6617 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6618 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6619 AArch64ISD::GLD1_MERGE_ZERO},
6620 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6621 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6622 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6623 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6624 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6625 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6626 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6627 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6628 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6629 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6630 };
6631 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6632 return AddrModes.find(Key)->second;
6633}
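// Usage sketch: getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/false,
// /*NeedsExtend=*/true) selects GLD1_UXTW_SCALED_MERGE_ZERO, i.e. a scaled
// gather whose 32-bit offsets are zero-extended.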
6634
6635unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6636 switch (Opcode) {
6637 default:
6638 llvm_unreachable("unimplemented opcode");
6639 return Opcode;
6640 case AArch64ISD::GLD1_MERGE_ZERO:
6641 return AArch64ISD::GLD1S_MERGE_ZERO;
6642 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6643 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6644 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6645 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6646 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6647 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6648 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6649 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6650 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6651 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6652 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6653 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6654 }
6655}
6656
6657SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6658 SelectionDAG &DAG) const {
6659 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6660
6661 SDLoc DL(Op);
6662 SDValue Chain = MGT->getChain();
6663 SDValue PassThru = MGT->getPassThru();
6664 SDValue Mask = MGT->getMask();
6665 SDValue BasePtr = MGT->getBasePtr();
6666 SDValue Index = MGT->getIndex();
6667 SDValue Scale = MGT->getScale();
6668 EVT VT = Op.getValueType();
6669 EVT MemVT = MGT->getMemoryVT();
6670 ISD::LoadExtType ExtType = MGT->getExtensionType();
6671 ISD::MemIndexType IndexType = MGT->getIndexType();
6672
6673 // SVE supports zero (and so undef) passthrough values only, everything else
6674 // must be handled manually by an explicit select on the load's output.
6675 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6676 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6677 SDValue Load =
6678 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6679 MGT->getMemOperand(), IndexType, ExtType);
6680 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6681 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6682 }
6683
6684 bool IsScaled = MGT->isIndexScaled();
6685 bool IsSigned = MGT->isIndexSigned();
6686
6687 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6688 // must be calculated beforehand.
6689 uint64_t ScaleVal = Scale->getAsZExtVal();
6690 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6691 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6692 EVT IndexVT = Index.getValueType();
6693 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6694 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6695 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
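 // Illustrative note: for a gather of i16 elements arriving with Scale == 8,
 // the indices are shifted left by log2(8) = 3 and Scale is reset to 1, so
 // the gather proceeds with plain byte offsets.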
6696
6697 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6698 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6699 MGT->getMemOperand(), IndexType, ExtType);
6700 }
6701
6702 // Lower fixed length gather to a scalable equivalent.
6703 if (VT.isFixedLengthVector()) {
6704 assert(Subtarget->useSVEForFixedLengthVectors() &&
6705 "Cannot lower when not using SVE for fixed vectors!");
6706
6707 // NOTE: Handle floating-point as if integer then bitcast the result.
6708 EVT DataVT = VT.changeVectorElementTypeToInteger();
6709 MemVT = MemVT.changeVectorElementTypeToInteger();
6710
6711 // Find the smallest integer fixed length vector we can use for the gather.
6712 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6713 if (DataVT.getVectorElementType() == MVT::i64 ||
6714 Index.getValueType().getVectorElementType() == MVT::i64 ||
6715 Mask.getValueType().getVectorElementType() == MVT::i64)
6716 PromotedVT = VT.changeVectorElementType(MVT::i64);
6717
6718 // Promote vector operands except for passthrough, which we know is either
6719 // undef or zero, and thus best constructed directly.
6720 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6721 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6722 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6723
6724 // A promoted result type forces the need for an extending load.
6725 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6726 ExtType = ISD::EXTLOAD;
6727
6728 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6729
6730 // Convert fixed length vector operands to scalable.
6731 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6732 Index = convertToScalableVector(DAG, ContainerVT, Index);
6733 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6734 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6735 : DAG.getConstant(0, DL, ContainerVT);
6736
6737 // Emit equivalent scalable vector gather.
6738 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6739 SDValue Load =
6740 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6741 Ops, MGT->getMemOperand(), IndexType, ExtType);
6742
6743 // Extract fixed length data then convert to the required result type.
6744 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6745 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6746 if (VT.isFloatingPoint())
6747 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6748
6749 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6750 }
6751
6752 // Everything else is legal.
6753 return Op;
6754}
6755
6756SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6757 SelectionDAG &DAG) const {
6758 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6759
6760 SDLoc DL(Op);
6761 SDValue Chain = MSC->getChain();
6762 SDValue StoreVal = MSC->getValue();
6763 SDValue Mask = MSC->getMask();
6764 SDValue BasePtr = MSC->getBasePtr();
6765 SDValue Index = MSC->getIndex();
6766 SDValue Scale = MSC->getScale();
6767 EVT VT = StoreVal.getValueType();
6768 EVT MemVT = MSC->getMemoryVT();
6769 ISD::MemIndexType IndexType = MSC->getIndexType();
6770 bool Truncating = MSC->isTruncatingStore();
6771
6772 bool IsScaled = MSC->isIndexScaled();
6773 bool IsSigned = MSC->isIndexSigned();
6774
6775 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6776 // must be calculated beforehand.
6777 uint64_t ScaleVal = Scale->getAsZExtVal();
6778 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6779 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6780 EVT IndexVT = Index.getValueType();
6781 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6782 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6783 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6784
6785 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6786 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6787 MSC->getMemOperand(), IndexType, Truncating);
6788 }
6789
6790 // Lower fixed length scatter to a scalable equivalent.
6791 if (VT.isFixedLengthVector()) {
6792 assert(Subtarget->useSVEForFixedLengthVectors() &&
6793 "Cannot lower when not using SVE for fixed vectors!");
6794
6795 // Once bitcast we treat floating-point scatters as if integer.
6796 if (VT.isFloatingPoint()) {
6797 VT = VT.changeVectorElementTypeToInteger();
6798 MemVT = MemVT.changeVectorElementTypeToInteger();
6799 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6800 }
6801
6802 // Find the smallest integer fixed length vector we can use for the scatter.
6803 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6804 if (VT.getVectorElementType() == MVT::i64 ||
6805 Index.getValueType().getVectorElementType() == MVT::i64 ||
6806 Mask.getValueType().getVectorElementType() == MVT::i64)
6807 PromotedVT = VT.changeVectorElementType(MVT::i64);
6808
6809 // Promote vector operands.
6810 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6811 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6812 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6813 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6814
6815 // A promoted value type forces the need for a truncating store.
6816 if (PromotedVT != VT)
6817 Truncating = true;
6818
6819 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6820
6821 // Convert fixed length vector operands to scalable.
6822 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6823 Index = convertToScalableVector(DAG, ContainerVT, Index);
6824 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6825 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6826
6827 // Emit equivalent scalable vector scatter.
6828 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6829 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6830 MSC->getMemOperand(), IndexType, Truncating);
6831 }
6832
6833 // Everything else is legal.
6834 return Op;
6835}
6836
6837SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6838 SDLoc DL(Op);
6839 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6840 assert(LoadNode && "Expected custom lowering of a masked load node");
6841 EVT VT = Op->getValueType(0);
6842
6843 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6844 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6845
6846 SDValue PassThru = LoadNode->getPassThru();
6847 SDValue Mask = LoadNode->getMask();
6848
6849 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6850 return Op;
6851
6853 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6854 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6855 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6856 LoadNode->getExtensionType());
6857
6858 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6859
6860 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6861}
6862
6863// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6864static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6865 EVT VT, EVT MemVT,
6866 SelectionDAG &DAG) {
6867 assert(VT.isVector() && "VT should be a vector type");
6868 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6869
6870 SDValue Value = ST->getValue();
6871
6872 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6873 // extracts the word lane which represents the v4i8 subvector. It optimizes
6874 // the store to:
6875 //
6876 // xtn v0.8b, v0.8h
6877 // str s0, [x0]
6878
6879 SDValue Undef = DAG.getUNDEF(MVT::i16);
6880 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6881 {Undef, Undef, Undef, Undef});
6882
6883 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6884 Value, UndefVec);
6885 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6886
6887 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6888 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6889 Trunc, DAG.getConstant(0, DL, MVT::i64));
6890
6891 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6892 ST->getBasePtr(), ST->getMemOperand());
6893}
6894
6895static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
6896 SDLoc DL(Op);
6897 SDValue Src = Op.getOperand(0);
6898 MVT DestVT = Op.getSimpleValueType();
6899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6900 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
6901
6902 unsigned SrcAS = N->getSrcAddressSpace();
6903 unsigned DestAS = N->getDestAddressSpace();
6904 assert(SrcAS != DestAS &&
6905 "addrspacecast must be between different address spaces");
6906 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
6907 TLI.getTargetMachine().getPointerSize(DestAS) &&
6908 "addrspacecast must be between different ptr sizes");
6909 (void)TLI;
6910
6911 if (SrcAS == ARM64AS::PTR32_SPTR) {
6912 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
6913 DAG.getTargetConstant(0, DL, DestVT));
6914 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
6915 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
6916 DAG.getTargetConstant(0, DL, DestVT));
6917 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
6918 (DestAS == ARM64AS::PTR32_UPTR)) {
6919 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
6920 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
6921 return Trunc;
6922 } else {
6923 return Src;
6924 }
6925}
6926
6927// Custom lowering for any store, vector or scalar and/or default or with
6928// a truncate operations. Currently only custom lower truncate operation
6929// from vector v4i16 to v4i8 or volatile stores of i128.
6930SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6931 SelectionDAG &DAG) const {
6932 SDLoc Dl(Op);
6933 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6934 assert (StoreNode && "Can only custom lower store nodes");
6935
6936 SDValue Value = StoreNode->getValue();
6937
6938 EVT VT = Value.getValueType();
6939 EVT MemVT = StoreNode->getMemoryVT();
6940
6941 if (VT.isVector()) {
6942 if (useSVEForFixedLengthVectorVT(
6943 VT,
6944 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6945 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6946
6947 unsigned AS = StoreNode->getAddressSpace();
6948 Align Alignment = StoreNode->getAlign();
6949 if (Alignment < MemVT.getStoreSize() &&
6950 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6951 StoreNode->getMemOperand()->getFlags(),
6952 nullptr)) {
6953 return scalarizeVectorStore(StoreNode, DAG);
6954 }
6955
6956 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6957 MemVT == MVT::v4i8) {
6958 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6959 }
6960 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6961 // the custom lowering, as there are no un-paired non-temporal stores and
6962 // legalization will break up 256 bit inputs.
6963 ElementCount EC = MemVT.getVectorElementCount();
6964 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6965 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6966 (MemVT.getScalarSizeInBits() == 8u ||
6967 MemVT.getScalarSizeInBits() == 16u ||
6968 MemVT.getScalarSizeInBits() == 32u ||
6969 MemVT.getScalarSizeInBits() == 64u)) {
6970 SDValue Lo =
6971 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6972 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6973 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6974 SDValue Hi =
6975 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6976 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6977 StoreNode->getValue(),
6978 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6979 SDValue Result = DAG.getMemIntrinsicNode(
6980 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6981 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
6982 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
6983 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6984 return Result;
6985 }
6986 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6987 return LowerStore128(Op, DAG);
6988 } else if (MemVT == MVT::i64x8) {
6989 SDValue Value = StoreNode->getValue();
6990 assert(Value->getValueType(0) == MVT::i64x8);
6991 SDValue Chain = StoreNode->getChain();
6992 SDValue Base = StoreNode->getBasePtr();
6993 EVT PtrVT = Base.getValueType();
6994 for (unsigned i = 0; i < 8; i++) {
6995 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6996 Value, DAG.getConstant(i, Dl, MVT::i32));
6997 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6998 DAG.getConstant(i * 8, Dl, PtrVT));
6999 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7000 StoreNode->getBaseAlign());
7001 }
7002 return Chain;
7003 }
7004
7005 return SDValue();
7006}
7007
7008/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7009SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7010 SelectionDAG &DAG) const {
7011 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7012 assert(StoreNode->getMemoryVT() == MVT::i128);
7013 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7014
7015 bool IsStoreRelease =
7016 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7017 if (StoreNode->isAtomic())
7018 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7019 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7020 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7021 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7022
7023 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7024 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7025 ? StoreNode->getOperand(1)
7026 : StoreNode->getOperand(2);
7027 SDLoc DL(Op);
7028 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7029 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7030 if (DAG.getDataLayout().isBigEndian())
7031 std::swap(StoreValue.first, StoreValue.second);
7032 SDValue Result = DAG.getMemIntrinsicNode(
7033 Opcode, DL, DAG.getVTList(MVT::Other),
7034 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7035 StoreNode->getBasePtr()},
7036 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7037 return Result;
7038}
7039
7040SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7041 SelectionDAG &DAG) const {
7042 SDLoc DL(Op);
7043 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7044 assert(LoadNode && "Expected custom lowering of a load node");
7045
7046 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7047 SmallVector<SDValue, 8> Ops;
7048 SDValue Base = LoadNode->getBasePtr();
7049 SDValue Chain = LoadNode->getChain();
7050 EVT PtrVT = Base.getValueType();
7051 for (unsigned i = 0; i < 8; i++) {
7052 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7053 DAG.getConstant(i * 8, DL, PtrVT));
7054 SDValue Part =
7055 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7056 LoadNode->getBaseAlign());
7057 Ops.push_back(Part);
7058 Chain = SDValue(Part.getNode(), 1);
7059 }
7060 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7061 return DAG.getMergeValues({Loaded, Chain}, DL);
7062 }
7063
7064 // Custom lowering for extending v4i8 vector loads.
7065 EVT VT = Op->getValueType(0);
7066 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7067
7068 if (LoadNode->getMemoryVT() != MVT::v4i8)
7069 return SDValue();
7070
7071 // Avoid generating unaligned loads.
7072 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7073 return SDValue();
7074
7075 unsigned ExtType;
7076 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7077 ExtType = ISD::SIGN_EXTEND;
7078 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7079 LoadNode->getExtensionType() == ISD::EXTLOAD)
7080 ExtType = ISD::ZERO_EXTEND;
7081 else
7082 return SDValue();
7083
7084 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7085 LoadNode->getBasePtr(), MachinePointerInfo());
7086 SDValue Chain = Load.getValue(1);
7087 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7088 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7089 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7090 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7091 DAG.getConstant(0, DL, MVT::i64));
7092 if (VT == MVT::v4i32)
7093 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7094 return DAG.getMergeValues({Ext, Chain}, DL);
7095}
7096
7097SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7098 SelectionDAG &DAG) const {
7099 SDLoc DL(Op);
7100 SDValue Vec = Op.getOperand(0);
7101 SDValue Mask = Op.getOperand(1);
7102 SDValue Passthru = Op.getOperand(2);
7103 EVT VecVT = Vec.getValueType();
7104 EVT MaskVT = Mask.getValueType();
7105 EVT ElmtVT = VecVT.getVectorElementType();
7106 const bool IsFixedLength = VecVT.isFixedLengthVector();
7107 const bool HasPassthru = !Passthru.isUndef();
7108 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7109 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7110
7111 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7112
7113 if (!Subtarget->isSVEAvailable())
7114 return SDValue();
7115
7116 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7117 return SDValue();
7118
7119 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7120 if (MinElmts != 2 && MinElmts != 4)
7121 return SDValue();
7122
7123 // We can use the SVE register containing the NEON vector in its lowest bits.
7124 if (IsFixedLength) {
7125 EVT ScalableVecVT =
7126 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7127 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7128 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7129
7130 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7131 DAG.getUNDEF(ScalableVecVT), Vec,
7132 DAG.getConstant(0, DL, MVT::i64));
7133 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7134 DAG.getUNDEF(ScalableMaskVT), Mask,
7135 DAG.getConstant(0, DL, MVT::i64));
7136    Mask = DAG.getNode(ISD::TRUNCATE, DL,
7137 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7138 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7139 DAG.getUNDEF(ScalableVecVT), Passthru,
7140 DAG.getConstant(0, DL, MVT::i64));
7141
7142 VecVT = Vec.getValueType();
7143 MaskVT = Mask.getValueType();
7144 }
7145
7146 // Get legal type for compact instruction
7147 EVT ContainerVT = getSVEContainerType(VecVT);
7148 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7149
7150 // Convert to i32 or i64 for smaller types, as these are the only supported
7151 // sizes for compact.
7152 if (ContainerVT != VecVT) {
7153 Vec = DAG.getBitcast(CastVT, Vec);
7154 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7155 }
7156
7157 SDValue Compressed = DAG.getNode(
7158      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7159 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7160
7161 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
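  // Otherwise, count the active lanes with CNTP, build a predicate covering
  // the first 'count' lanes with WHILELO, and VSELECT between the compacted
  // vector (head) and the passthru values (tail).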
7162 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7163 SDValue Offset = DAG.getNode(
7164 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7165 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7166
7167 SDValue IndexMask = DAG.getNode(
7168 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7169 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7170 DAG.getConstant(0, DL, MVT::i64), Offset);
7171
7172 Compressed =
7173 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7174 }
7175
7176 // Extracting from a legal SVE type before truncating produces better code.
7177 if (IsFixedLength) {
7178 Compressed = DAG.getNode(
7179        ISD::EXTRACT_SUBVECTOR, DL,
7180 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7181 Compressed, DAG.getConstant(0, DL, MVT::i64));
7182 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7183 VecVT = FixedVecVT;
7184 }
7185
7186 // If we changed the element type before, we need to convert it back.
7187 if (ContainerVT != VecVT) {
7188 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7189 Compressed = DAG.getBitcast(VecVT, Compressed);
7190 }
7191
7192 return Compressed;
7193}
7194
7195// Generate SUBS and CSEL for integer abs.
7196SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7197 MVT VT = Op.getSimpleValueType();
7198
7199 if (VT.isVector())
7200 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7201
7202 SDLoc DL(Op);
7203 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7204 Op.getOperand(0));
7205 // Generate SUBS & CSEL.
7206 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7207 Op.getOperand(0), DAG.getConstant(0, DL, VT));
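  // CSEL keeps the original value when the SUBS flags indicate it is positive
  // or zero (PL) and otherwise selects the negation.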
7208 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7209 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7210}
7211
7212static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7213 SDValue Chain = Op.getOperand(0);
7214 SDValue Cond = Op.getOperand(1);
7215 SDValue Dest = Op.getOperand(2);
7216
7217  AArch64CC::CondCode CC;
7218 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7219 SDLoc DL(Op);
7220 SDValue CCVal = getCondCode(DAG, CC);
7221 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7222 Cmp);
7223 }
7224
7225 return SDValue();
7226}
7227
7228// Treat FSHR with constant shifts as a legal operation; otherwise it is expanded.
7229// FSHL is converted to FSHR before deciding what to do with it.
7230static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7231 SDValue Shifts = Op.getOperand(2);
7232 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7233 // If opcode is FSHL, convert it to FSHR
7234 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7235 SDLoc DL(Op);
7236 MVT VT = Op.getSimpleValueType();
7237 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7238
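    // A funnel shift left by N is a funnel shift right by BitWidth - N, e.g.
    // on i32 fshl(x, y, 3) == fshr(x, y, 29), so only FSHR is handled below.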
7239 if (Op.getOpcode() == ISD::FSHL) {
7240 if (NewShiftNo == 0)
7241 return Op.getOperand(0);
7242
7243 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7244 return DAG.getNode(
7245 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7246 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7247 }
7248
7249 if (Op.getOpcode() == ISD::FSHR) {
7250 if (NewShiftNo == 0)
7251 return Op.getOperand(1);
7252
7253 if (ShiftNo->getZExtValue() == NewShiftNo)
7254 return Op;
7255
7256 // Rewrite using the normalised shift amount.
7257 return DAG.getNode(
7258 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7259 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7260 }
7261 }
7262
7263 return SDValue();
7264}
7265
7266static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7267 SDValue X = Op.getOperand(0);
7268 EVT XScalarTy = X.getValueType();
7269 SDValue Exp = Op.getOperand(1);
7270
7271 SDLoc DL(Op);
7272 EVT XVT, ExpVT;
7273 switch (Op.getSimpleValueType().SimpleTy) {
7274 default:
7275 return SDValue();
7276 case MVT::bf16:
7277 case MVT::f16:
7278 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7279 [[fallthrough]];
7280 case MVT::f32:
7281 XVT = MVT::nxv4f32;
7282 ExpVT = MVT::nxv4i32;
7283 break;
7284 case MVT::f64:
7285 XVT = MVT::nxv2f64;
7286 ExpVT = MVT::nxv2i64;
7287 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7288 break;
7289 }
7290
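  // SVE FSCALE computes X * 2^Exp per lane, so the scalar operands are moved
  // into lane 0 of SVE vectors, scaled under an all-true predicate, and the
  // scalar result is extracted back out of lane 0.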
7291 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7292 SDValue VX =
7293 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7294 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7295 DAG.getUNDEF(ExpVT), Exp, Zero);
7296 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7297 AArch64SVEPredPattern::all);
7298 SDValue FScale =
7299      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7300 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7301 VPg, VX, VExp);
7302 SDValue Final =
7303 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7304 if (X.getValueType() != XScalarTy)
7305 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7306 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7307 return Final;
7308}
7309
7310SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7311 SelectionDAG &DAG) const {
7312 return Op.getOperand(0);
7313}
7314
7315SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7316 SelectionDAG &DAG) const {
7317 SDValue Chain = Op.getOperand(0);
7318 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7319 SDValue FPtr = Op.getOperand(2); // nested function
7320 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7321
7322 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7323
7324 // ldr NestReg, .+16
7325 // ldr x17, .+20
7326 // br x17
7327 // .word 0
7328 // .nest: .qword nest
7329 // .fptr: .qword fptr
7330 SDValue OutChains[5];
7331
7332 const Function *Func =
7333 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7334 CallingConv::ID CC = Func->getCallingConv();
7335 unsigned NestReg;
7336
7337 switch (CC) {
7338 default:
7339 NestReg = 0x0f; // X15
7340 break;
7342 // Must be kept in sync with AArch64CallingConv.td
7343 NestReg = 0x04; // X4
7344 break;
7345 }
7346
7347 const char FptrReg = 0x11; // X17
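  // The three words stored below are pre-encoded A64 instructions:
  //   0x58000080 | NestReg : LDR Xnest, .+16  (loads the .nest slot)
  //   0x580000b0 | FptrReg : LDR X17, .+20    (loads the .fptr slot)
  //   0xd61f0220           : BR X17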
7348
7349 SDValue Addr = Trmp;
7350
7351 SDLoc DL(Op);
7352 OutChains[0] = DAG.getStore(
7353 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7354 MachinePointerInfo(TrmpAddr));
7355
7356 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7357 DAG.getConstant(4, DL, MVT::i64));
7358 OutChains[1] = DAG.getStore(
7359 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7360 MachinePointerInfo(TrmpAddr, 4));
7361
7362 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7363 DAG.getConstant(8, DL, MVT::i64));
7364 OutChains[2] =
7365 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7366 MachinePointerInfo(TrmpAddr, 8));
7367
7368 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7369 DAG.getConstant(16, DL, MVT::i64));
7370 OutChains[3] =
7371 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7372
7373 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7374 DAG.getConstant(24, DL, MVT::i64));
7375 OutChains[4] =
7376 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7377
7378 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7379
7380 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7381 DAG.getConstant(12, DL, MVT::i64));
7382
7383 // Call clear cache on the trampoline instructions.
7384 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7385 EndOfTrmp);
7386}
7387
7388SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7389 SelectionDAG &DAG) const {
7390 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7391 LLVM_DEBUG(Op.dump());
7392
7393 switch (Op.getOpcode()) {
7394 default:
7395 llvm_unreachable("unimplemented operand");
7396 return SDValue();
7397 case ISD::BITCAST:
7398 return LowerBITCAST(Op, DAG);
7399 case ISD::GlobalAddress:
7400 return LowerGlobalAddress(Op, DAG);
7401  case ISD::GlobalTLSAddress:
7402 return LowerGlobalTLSAddress(Op, DAG);
7403  case ISD::PtrAuthGlobalAddress:
7404 return LowerPtrAuthGlobalAddress(Op, DAG);
7405  case ISD::ADJUST_TRAMPOLINE:
7406 return LowerADJUST_TRAMPOLINE(Op, DAG);
7407  case ISD::INIT_TRAMPOLINE:
7408 return LowerINIT_TRAMPOLINE(Op, DAG);
7409 case ISD::SETCC:
7410 case ISD::STRICT_FSETCC:
7411  case ISD::STRICT_FSETCCS:
7412 return LowerSETCC(Op, DAG);
7413 case ISD::SETCCCARRY:
7414 return LowerSETCCCARRY(Op, DAG);
7415 case ISD::BRCOND:
7416 return LowerBRCOND(Op, DAG);
7417 case ISD::BR_CC:
7418 return LowerBR_CC(Op, DAG);
7419 case ISD::SELECT:
7420 return LowerSELECT(Op, DAG);
7421 case ISD::SELECT_CC:
7422 return LowerSELECT_CC(Op, DAG);
7423 case ISD::JumpTable:
7424 return LowerJumpTable(Op, DAG);
7425 case ISD::BR_JT:
7426 return LowerBR_JT(Op, DAG);
7427 case ISD::BRIND:
7428 return LowerBRIND(Op, DAG);
7429 case ISD::ConstantPool:
7430 return LowerConstantPool(Op, DAG);
7431 case ISD::BlockAddress:
7432 return LowerBlockAddress(Op, DAG);
7433 case ISD::VASTART:
7434 return LowerVASTART(Op, DAG);
7435 case ISD::VACOPY:
7436 return LowerVACOPY(Op, DAG);
7437 case ISD::VAARG:
7438 return LowerVAARG(Op, DAG);
7439 case ISD::UADDO_CARRY:
7440 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7441 case ISD::USUBO_CARRY:
7442 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7443 case ISD::SADDO_CARRY:
7444 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7445 case ISD::SSUBO_CARRY:
7446 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7447 case ISD::SADDO:
7448 case ISD::UADDO:
7449 case ISD::SSUBO:
7450 case ISD::USUBO:
7451 case ISD::SMULO:
7452 case ISD::UMULO:
7453 return LowerXALUO(Op, DAG);
7454 case ISD::FADD:
7455 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7456 case ISD::FSUB:
7457 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7458 case ISD::FMUL:
7459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7460 case ISD::FMA:
7461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7462 case ISD::FDIV:
7463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7464 case ISD::FNEG:
7465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7466 case ISD::FCEIL:
7467 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7468 case ISD::FFLOOR:
7469 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7470 case ISD::FNEARBYINT:
7471 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7472 case ISD::FRINT:
7473 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7474 case ISD::FROUND:
7475 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7476 case ISD::FROUNDEVEN:
7477 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7478 case ISD::FTRUNC:
7479 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7480 case ISD::FSQRT:
7481 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7482 case ISD::FABS:
7483 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7484 case ISD::FP_ROUND:
7485  case ISD::STRICT_FP_ROUND:
7486 return LowerFP_ROUND(Op, DAG);
7487 case ISD::FP_EXTEND:
7488  case ISD::STRICT_FP_EXTEND:
7489 return LowerFP_EXTEND(Op, DAG);
7490 case ISD::FRAMEADDR:
7491 return LowerFRAMEADDR(Op, DAG);
7492 case ISD::SPONENTRY:
7493 return LowerSPONENTRY(Op, DAG);
7494 case ISD::RETURNADDR:
7495 return LowerRETURNADDR(Op, DAG);
7496  case ISD::ADDROFRETURNADDR:
7497 return LowerADDROFRETURNADDR(Op, DAG);
7498  case ISD::CONCAT_VECTORS:
7499 return LowerCONCAT_VECTORS(Op, DAG);
7500  case ISD::INSERT_VECTOR_ELT:
7501 return LowerINSERT_VECTOR_ELT(Op, DAG);
7502  case ISD::EXTRACT_VECTOR_ELT:
7503 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7504 case ISD::BUILD_VECTOR:
7505 return LowerBUILD_VECTOR(Op, DAG);
7506  case ISD::ZERO_EXTEND_VECTOR_INREG:
7507 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7508  case ISD::VECTOR_SHUFFLE:
7509 return LowerVECTOR_SHUFFLE(Op, DAG);
7510 case ISD::SPLAT_VECTOR:
7511 return LowerSPLAT_VECTOR(Op, DAG);
7512  case ISD::EXTRACT_SUBVECTOR:
7513 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7514  case ISD::INSERT_SUBVECTOR:
7515 return LowerINSERT_SUBVECTOR(Op, DAG);
7516 case ISD::SDIV:
7517 case ISD::UDIV:
7518 return LowerDIV(Op, DAG);
7519 case ISD::SMIN:
7520 case ISD::UMIN:
7521 case ISD::SMAX:
7522 case ISD::UMAX:
7523 return LowerMinMax(Op, DAG);
7524 case ISD::SRA:
7525 case ISD::SRL:
7526 case ISD::SHL:
7527 return LowerVectorSRA_SRL_SHL(Op, DAG);
7528 case ISD::SHL_PARTS:
7529 case ISD::SRL_PARTS:
7530 case ISD::SRA_PARTS:
7531 return LowerShiftParts(Op, DAG);
7532 case ISD::CTPOP:
7533 case ISD::PARITY:
7534 return LowerCTPOP_PARITY(Op, DAG);
7535 case ISD::FCOPYSIGN:
7536 return LowerFCOPYSIGN(Op, DAG);
7537 case ISD::OR:
7538 return LowerVectorOR(Op, DAG);
7539 case ISD::XOR:
7540 return LowerXOR(Op, DAG);
7541 case ISD::PREFETCH:
7542 return LowerPREFETCH(Op, DAG);
7543 case ISD::SINT_TO_FP:
7544 case ISD::UINT_TO_FP:
7545  case ISD::STRICT_SINT_TO_FP:
7546  case ISD::STRICT_UINT_TO_FP:
7547 return LowerINT_TO_FP(Op, DAG);
7548 case ISD::FP_TO_SINT:
7549 case ISD::FP_TO_UINT:
7550  case ISD::STRICT_FP_TO_SINT:
7551  case ISD::STRICT_FP_TO_UINT:
7552 return LowerFP_TO_INT(Op, DAG);
7553  case ISD::FP_TO_SINT_SAT:
7554  case ISD::FP_TO_UINT_SAT:
7555 return LowerFP_TO_INT_SAT(Op, DAG);
7556 case ISD::FSINCOS:
7557 return LowerFSINCOS(Op, DAG);
7558 case ISD::GET_ROUNDING:
7559 return LowerGET_ROUNDING(Op, DAG);
7560 case ISD::SET_ROUNDING:
7561 return LowerSET_ROUNDING(Op, DAG);
7562 case ISD::GET_FPMODE:
7563 return LowerGET_FPMODE(Op, DAG);
7564 case ISD::SET_FPMODE:
7565 return LowerSET_FPMODE(Op, DAG);
7566 case ISD::RESET_FPMODE:
7567 return LowerRESET_FPMODE(Op, DAG);
7568 case ISD::MUL:
7569 return LowerMUL(Op, DAG);
7570 case ISD::MULHS:
7571 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7572 case ISD::MULHU:
7573 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7574  case ISD::INTRINSIC_W_CHAIN:
7575 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7576  case ISD::INTRINSIC_WO_CHAIN:
7577 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7578  case ISD::INTRINSIC_VOID:
7579 return LowerINTRINSIC_VOID(Op, DAG);
7580 case ISD::ATOMIC_STORE:
7581 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7582 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7583 return LowerStore128(Op, DAG);
7584 }
7585 return SDValue();
7586 case ISD::STORE:
7587 return LowerSTORE(Op, DAG);
7588 case ISD::MSTORE:
7589 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7590 case ISD::MGATHER:
7591 return LowerMGATHER(Op, DAG);
7592 case ISD::MSCATTER:
7593 return LowerMSCATTER(Op, DAG);
7594  case ISD::VECREDUCE_SEQ_FADD:
7595 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7596 case ISD::VECREDUCE_ADD:
7597 case ISD::VECREDUCE_AND:
7598 case ISD::VECREDUCE_OR:
7599 case ISD::VECREDUCE_XOR:
7600  case ISD::VECREDUCE_SMAX:
7601  case ISD::VECREDUCE_SMIN:
7602  case ISD::VECREDUCE_UMAX:
7603  case ISD::VECREDUCE_UMIN:
7604  case ISD::VECREDUCE_FADD:
7605  case ISD::VECREDUCE_FMAX:
7606  case ISD::VECREDUCE_FMIN:
7607  case ISD::VECREDUCE_FMAXIMUM:
7608  case ISD::VECREDUCE_FMINIMUM:
7609 return LowerVECREDUCE(Op, DAG);
7610  case ISD::ATOMIC_LOAD_AND:
7611 return LowerATOMIC_LOAD_AND(Op, DAG);
7612  case ISD::DYNAMIC_STACKALLOC:
7613 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7614 case ISD::VSCALE:
7615 return LowerVSCALE(Op, DAG);
7616  case ISD::VECTOR_COMPRESS:
7617 return LowerVECTOR_COMPRESS(Op, DAG);
7618 case ISD::ANY_EXTEND:
7619 case ISD::SIGN_EXTEND:
7620 case ISD::ZERO_EXTEND:
7621 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7622 case ISD::ADDRSPACECAST:
7623 return LowerADDRSPACECAST(Op, DAG);
7624  case ISD::SIGN_EXTEND_INREG: {
7625 // Only custom lower when ExtraVT has a legal byte-based element type.
7626 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7627 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7628 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7629 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7630 return SDValue();
7631
7632 return LowerToPredicatedOp(Op, DAG,
7633 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7634 }
7635 case ISD::TRUNCATE:
7636 return LowerTRUNCATE(Op, DAG);
7637 case ISD::MLOAD:
7638 return LowerMLOAD(Op, DAG);
7639 case ISD::LOAD:
7640 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7641 !Subtarget->isNeonAvailable()))
7642 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7643 return LowerLOAD(Op, DAG);
7644 case ISD::ADD:
7645 case ISD::AND:
7646 case ISD::SUB:
7647 return LowerToScalableOp(Op, DAG);
7648 case ISD::FMAXIMUM:
7649 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7650 case ISD::FMAXNUM:
7651 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7652 case ISD::FMINIMUM:
7653 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7654 case ISD::FMINNUM:
7655 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7656 case ISD::VSELECT:
7657 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7658 case ISD::ABS:
7659 return LowerABS(Op, DAG);
7660 case ISD::ABDS:
7661 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7662 case ISD::ABDU:
7663 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7664 case ISD::AVGFLOORS:
7665 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7666 case ISD::AVGFLOORU:
7667 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7668 case ISD::AVGCEILS:
7669 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7670 case ISD::AVGCEILU:
7671 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7672 case ISD::BITREVERSE:
7673 return LowerBitreverse(Op, DAG);
7674 case ISD::BSWAP:
7675 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7676 case ISD::CTLZ:
7677 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7678 case ISD::CTTZ:
7679 return LowerCTTZ(Op, DAG);
7680 case ISD::VECTOR_SPLICE:
7681 return LowerVECTOR_SPLICE(Op, DAG);
7682  case ISD::VECTOR_DEINTERLEAVE:
7683 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7684  case ISD::VECTOR_INTERLEAVE:
7685 return LowerVECTOR_INTERLEAVE(Op, DAG);
7686  case ISD::GET_ACTIVE_LANE_MASK:
7687 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7688 case ISD::LRINT:
7689 case ISD::LLRINT:
7690 if (Op.getValueType().isVector())
7691 return LowerVectorXRINT(Op, DAG);
7692 [[fallthrough]];
7693 case ISD::LROUND:
7694 case ISD::LLROUND: {
7695 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7696 Op.getOperand(0).getValueType() == MVT::bf16) &&
7697 "Expected custom lowering of rounding operations only for f16");
7698 SDLoc DL(Op);
7699 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7700 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7701 }
7702 case ISD::STRICT_LROUND:
7703  case ISD::STRICT_LLROUND:
7704 case ISD::STRICT_LRINT:
7705 case ISD::STRICT_LLRINT: {
7706 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7707 Op.getOperand(1).getValueType() == MVT::bf16) &&
7708 "Expected custom lowering of rounding operations only for f16");
7709 SDLoc DL(Op);
7710 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7711 {Op.getOperand(0), Op.getOperand(1)});
7712 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7713 {Ext.getValue(1), Ext.getValue(0)});
7714 }
7715 case ISD::WRITE_REGISTER: {
7716 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7717 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7718 SDLoc DL(Op);
7719
7720 SDValue Chain = Op.getOperand(0);
7721 SDValue SysRegName = Op.getOperand(1);
7722 std::pair<SDValue, SDValue> Pair =
7723 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7724
7725 // chain = MSRR(chain, sysregname, lo, hi)
7726 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7727 SysRegName, Pair.first, Pair.second);
7728
7729 return Result;
7730 }
7731 case ISD::FSHL:
7732 case ISD::FSHR:
7733 return LowerFunnelShift(Op, DAG);
7734 case ISD::FLDEXP:
7735 return LowerFLDEXP(Op, DAG);
7736  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7737 return LowerVECTOR_HISTOGRAM(Op, DAG);
7738  case ISD::PARTIAL_REDUCE_SMLA:
7739  case ISD::PARTIAL_REDUCE_UMLA:
7740  case ISD::PARTIAL_REDUCE_SUMLA:
7741 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7742 }
7743}
7744
7745bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7746 return !Subtarget->useSVEForFixedLengthVectors();
7747}
7748
7749bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7750 EVT VT, bool OverrideNEON) const {
7751 if (!VT.isFixedLengthVector() || !VT.isSimple())
7752 return false;
7753
7754 // Don't use SVE for vectors we cannot scalarize if required.
7755 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7756 // Fixed length predicates should be promoted to i8.
7757 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7758 case MVT::i1:
7759 default:
7760 return false;
7761 case MVT::i8:
7762 case MVT::i16:
7763 case MVT::i32:
7764 case MVT::i64:
7765 case MVT::f16:
7766 case MVT::f32:
7767 case MVT::f64:
7768 break;
7769 }
7770
7771 // NEON-sized vectors can be emulated using SVE instructions.
7772 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7773 return Subtarget->isSVEorStreamingSVEAvailable();
7774
7775 // Ensure NEON MVTs only belong to a single register class.
7776 if (VT.getFixedSizeInBits() <= 128)
7777 return false;
7778
7779 // Ensure wider than NEON code generation is enabled.
7780 if (!Subtarget->useSVEForFixedLengthVectors())
7781 return false;
7782
7783 // Don't use SVE for types that don't fit.
7784 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7785 return false;
7786
7787 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7788 // the base fixed length SVE support in place.
7789 if (!VT.isPow2VectorType())
7790 return false;
7791
7792 return true;
7793}
7794
7795//===----------------------------------------------------------------------===//
7796// Calling Convention Implementation
7797//===----------------------------------------------------------------------===//
7798
7799static unsigned getIntrinsicID(const SDNode *N) {
7800 unsigned Opcode = N->getOpcode();
7801 switch (Opcode) {
7802  default:
7803    return Intrinsic::not_intrinsic;
7804  case ISD::INTRINSIC_WO_CHAIN: {
7805 unsigned IID = N->getConstantOperandVal(0);
7806 if (IID < Intrinsic::num_intrinsics)
7807 return IID;
7808    return Intrinsic::not_intrinsic;
7809 }
7810 }
7811}
7812
7813bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7814 SDValue N1) const {
7815 if (!N0.hasOneUse())
7816 return false;
7817
7818 unsigned IID = getIntrinsicID(N1.getNode());
7819 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7820 if (IID == Intrinsic::aarch64_neon_umull ||
7821 N1.getOpcode() == AArch64ISD::UMULL ||
7822 IID == Intrinsic::aarch64_neon_smull ||
7823 N1.getOpcode() == AArch64ISD::SMULL)
7824 return N0.getOpcode() != ISD::ADD;
7825
7826 return true;
7827}
7828
7829/// Selects the correct CCAssignFn for a given CallingConvention value.
7830CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7831 bool IsVarArg) const {
7832 switch (CC) {
7833 default:
7834 reportFatalUsageError("unsupported calling convention");
7835 case CallingConv::GHC:
7836 return CC_AArch64_GHC;
7837  case CallingConv::PreserveNone:
7838 // The VarArg implementation makes assumptions about register
7839 // argument passing that do not hold for preserve_none, so we
7840 // instead fall back to C argument passing.
7841 // The non-vararg case is handled in the CC function itself.
7842 if (!IsVarArg)
7843      return CC_AArch64_Preserve_None;
7844 [[fallthrough]];
7845 case CallingConv::C:
7846 case CallingConv::Fast:
7847  case CallingConv::PreserveMost:
7848  case CallingConv::PreserveAll:
7849  case CallingConv::CXX_FAST_TLS:
7850 case CallingConv::Swift:
7851  case CallingConv::SwiftTail:
7852 case CallingConv::Tail:
7853 case CallingConv::GRAAL:
7854 if (Subtarget->isTargetWindows()) {
7855 if (IsVarArg) {
7856 if (Subtarget->isWindowsArm64EC())
7857          return CC_AArch64_Arm64EC_VarArg;
7858        return CC_AArch64_Win64_VarArg;
7859 }
7860 return CC_AArch64_Win64PCS;
7861 }
7862 if (!Subtarget->isTargetDarwin())
7863 return CC_AArch64_AAPCS;
7864 if (!IsVarArg)
7865 return CC_AArch64_DarwinPCS;
7866    return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7867                                      : CC_AArch64_DarwinPCS_VarArg;
7868 case CallingConv::Win64:
7869 if (IsVarArg) {
7870 if (Subtarget->isWindowsArm64EC())
7871        return CC_AArch64_Arm64EC_VarArg;
7872      return CC_AArch64_Win64_VarArg;
7873 }
7874 return CC_AArch64_Win64PCS;
7875  case CallingConv::CFGuard_Check:
7876 if (Subtarget->isWindowsArm64EC())
7877      return CC_AArch64_Arm64EC_CFGuard_Check;
7878    return CC_AArch64_Win64_CFGuard_Check;
7884 return CC_AArch64_AAPCS;
7889 }
7890}
7891
7892CCAssignFn *
7893AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7894 switch (CC) {
7895 default:
7896 return RetCC_AArch64_AAPCS;
7897  case CallingConv::ARM64EC_Thunk_X64:
7898    return RetCC_AArch64_Arm64EC_Thunk;
7899  case CallingConv::CFGuard_Check:
7900 if (Subtarget->isWindowsArm64EC())
7901      return RetCC_AArch64_Arm64EC_CFGuard_Check;
7902 return RetCC_AArch64_AAPCS;
7903 }
7904}
7905
7906static bool isPassedInFPR(EVT VT) {
7907 return VT.isFixedLengthVector() ||
7908 (VT.isFloatingPoint() && !VT.isScalableVector());
7909}
7910
7911SDValue AArch64TargetLowering::LowerFormalArguments(
7912 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7913 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7914 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7915  MachineFunction &MF = DAG.getMachineFunction();
7916 const Function &F = MF.getFunction();
7917 MachineFrameInfo &MFI = MF.getFrameInfo();
7918 bool IsWin64 =
7919 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7920 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7921 (isVarArg && Subtarget->isWindowsArm64EC());
7922  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7923
7924  SmallVector<ISD::OutputArg, 4> Outs;
7925 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7926                DAG.getTargetLoweringInfo(), MF.getDataLayout());
7927 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7928 FuncInfo->setIsSVECC(true);
7929
7930 // Assign locations to all of the incoming arguments.
7931  SmallVector<CCValAssign, 16> ArgLocs;
7932 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7933
7934 // At this point, Ins[].VT may already be promoted to i32. To correctly
7935 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7936 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7937 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7938 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7939 // LocVT.
7940 unsigned NumArgs = Ins.size();
7941 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7942 unsigned CurArgIdx = 0;
7943 bool UseVarArgCC = false;
7944 if (IsWin64)
7945 UseVarArgCC = isVarArg;
7946
7947 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7948
7949 for (unsigned i = 0; i != NumArgs; ++i) {
7950 MVT ValVT = Ins[i].VT;
7951 if (Ins[i].isOrigArg()) {
7952 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7953 CurArgIdx = Ins[i].getOrigArgIndex();
7954
7955 // Get type of the original argument.
7956 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7957 /*AllowUnknown*/ true);
7958 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7959 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7960 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7961 ValVT = MVT::i8;
7962 else if (ActualMVT == MVT::i16)
7963 ValVT = MVT::i16;
7964 }
7965 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
7966 Ins[i].OrigTy, CCInfo);
7967 assert(!Res && "Call operand has unhandled type");
7968 (void)Res;
7969 }
7970
7971 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
7972 bool IsLocallyStreaming =
7973 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7974 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7975 SDValue Glue = Chain.getValue(1);
7976
7977 unsigned ExtraArgLocs = 0;
7978 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7979 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7980
7981 if (Ins[i].Flags.isByVal()) {
7982 // Byval is used for HFAs in the PCS, but the system should work in a
7983 // non-compliant manner for larger structs.
7984 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7985 int Size = Ins[i].Flags.getByValSize();
7986 unsigned NumRegs = (Size + 7) / 8;
7987
7988 // FIXME: This works on big-endian for composite byvals, which are the common
7989 // case. It should also work for fundamental types too.
7990 unsigned FrameIdx =
7991 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7992 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7993 InVals.push_back(FrameIdxN);
7994
7995 continue;
7996 }
7997
7998 if (Ins[i].Flags.isSwiftAsync())
7999      MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8000
8001 SDValue ArgValue;
8002 if (VA.isRegLoc()) {
8003 // Arguments stored in registers.
8004 EVT RegVT = VA.getLocVT();
8005 const TargetRegisterClass *RC;
8006
8007 if (RegVT == MVT::i32)
8008 RC = &AArch64::GPR32RegClass;
8009 else if (RegVT == MVT::i64)
8010 RC = &AArch64::GPR64RegClass;
8011 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8012 RC = &AArch64::FPR16RegClass;
8013 else if (RegVT == MVT::f32)
8014 RC = &AArch64::FPR32RegClass;
8015 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8016 RC = &AArch64::FPR64RegClass;
8017 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8018 RC = &AArch64::FPR128RegClass;
8019 else if (RegVT.isScalableVector() &&
8020 RegVT.getVectorElementType() == MVT::i1) {
8021 FuncInfo->setIsSVECC(true);
8022 RC = &AArch64::PPRRegClass;
8023 } else if (RegVT == MVT::aarch64svcount) {
8024 FuncInfo->setIsSVECC(true);
8025 RC = &AArch64::PPRRegClass;
8026 } else if (RegVT.isScalableVector()) {
8027 FuncInfo->setIsSVECC(true);
8028 RC = &AArch64::ZPRRegClass;
8029 } else
8030 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8031
8032 // Transform the arguments in physical registers into virtual ones.
8033 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8034
8035 if (IsLocallyStreaming) {
8036 // LocallyStreamingFunctions must insert the SMSTART in the correct
8037 // position, so we use Glue to ensure no instructions can be scheduled
8038 // between the chain of:
8039 // t0: ch,glue = EntryNode
8040 // t1: res,ch,glue = CopyFromReg
8041 // ...
8042 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8043 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8044 // ^^^^^^
8045 // This will be the new Chain/Root node.
8046 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8047 Glue = ArgValue.getValue(2);
8048 if (isPassedInFPR(ArgValue.getValueType())) {
8049 ArgValue =
8050 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8051 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8052 {ArgValue, Glue});
8053 Glue = ArgValue.getValue(1);
8054 }
8055 } else
8056 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8057
8058 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8059 // to 64 bits. Insert an assert[sz]ext to capture this, then
8060 // truncate to the right size.
8061 switch (VA.getLocInfo()) {
8062 default:
8063 llvm_unreachable("Unknown loc info!");
8064 case CCValAssign::Full:
8065 break;
8066      case CCValAssign::Indirect:
8067 assert(
8068 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8069 "Indirect arguments should be scalable on most subtargets");
8070 break;
8071 case CCValAssign::BCvt:
8072 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8073 break;
8074 case CCValAssign::AExt:
8075 case CCValAssign::SExt:
8076 case CCValAssign::ZExt:
8077 break;
8079 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8080 DAG.getConstant(32, DL, RegVT));
8081 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8082 break;
8083 }
8084 } else { // VA.isRegLoc()
8085 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8086 unsigned ArgOffset = VA.getLocMemOffset();
8087 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8088 ? VA.getLocVT().getSizeInBits()
8089 : VA.getValVT().getSizeInBits()) / 8;
8090
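      // On big-endian targets an argument smaller than 8 bytes lives in the
      // high part of its 8-byte stack slot, so bias the load offset accordingly.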
8091 uint32_t BEAlign = 0;
8092 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8093 !Ins[i].Flags.isInConsecutiveRegs())
8094 BEAlign = 8 - ArgSize;
8095
8096 SDValue FIN;
8097 MachinePointerInfo PtrInfo;
8098 if (StackViaX4) {
8099 // In both the ARM64EC varargs convention and the thunk convention,
8100 // arguments on the stack are accessed relative to x4, not sp. In
8101 // the thunk convention, there's an additional offset of 32 bytes
8102 // to account for the shadow store.
8103 unsigned ObjOffset = ArgOffset + BEAlign;
8104 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8105 ObjOffset += 32;
8106 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8107 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8108 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8109 DAG.getConstant(ObjOffset, DL, MVT::i64));
8111 } else {
8112 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8113
8114 // Create load nodes to retrieve arguments from the stack.
8115 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8116 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8117 }
8118
8119 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8120      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8121 MVT MemVT = VA.getValVT();
8122
8123 switch (VA.getLocInfo()) {
8124 default:
8125 break;
8126 case CCValAssign::Trunc:
8127 case CCValAssign::BCvt:
8128 MemVT = VA.getLocVT();
8129 break;
8130      case CCValAssign::Indirect:
8131        assert((VA.getValVT().isScalableVT() ||
8132 Subtarget->isWindowsArm64EC()) &&
8133 "Indirect arguments should be scalable on most subtargets");
8134 MemVT = VA.getLocVT();
8135 break;
8136 case CCValAssign::SExt:
8137 ExtType = ISD::SEXTLOAD;
8138 break;
8139 case CCValAssign::ZExt:
8140 ExtType = ISD::ZEXTLOAD;
8141 break;
8142 case CCValAssign::AExt:
8143 ExtType = ISD::EXTLOAD;
8144 break;
8145 }
8146
8147 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8148 MemVT);
8149 }
8150
8151 if (VA.getLocInfo() == CCValAssign::Indirect) {
8152 assert((VA.getValVT().isScalableVT() ||
8153 Subtarget->isWindowsArm64EC()) &&
8154 "Indirect arguments should be scalable on most subtargets");
8155
8156 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8157 unsigned NumParts = 1;
8158 if (Ins[i].Flags.isInConsecutiveRegs()) {
8159 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8160 ++NumParts;
8161 }
8162
8163 MVT PartLoad = VA.getValVT();
8164 SDValue Ptr = ArgValue;
8165
8166 // Ensure we generate all loads for each tuple part, whilst updating the
8167 // pointer after each load correctly using vscale.
8168 while (NumParts > 0) {
8169 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8170 InVals.push_back(ArgValue);
8171 NumParts--;
8172 if (NumParts > 0) {
8173 SDValue BytesIncrement;
8174 if (PartLoad.isScalableVector()) {
8175 BytesIncrement = DAG.getVScale(
8176 DL, Ptr.getValueType(),
8177 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8178 } else {
8179 BytesIncrement = DAG.getConstant(
8180 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8181 Ptr.getValueType());
8182 }
8183 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8184 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8185 ExtraArgLocs++;
8186 i++;
8187 }
8188 }
8189 } else {
8190 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8191 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8192 ArgValue, DAG.getValueType(MVT::i32));
8193
8194 // i1 arguments are zero-extended to i8 by the caller. Emit a
8195 // hint to reflect this.
8196 if (Ins[i].isOrigArg()) {
8197 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8198 if (OrigArg->getType()->isIntegerTy(1)) {
8199 if (!Ins[i].Flags.isZExt()) {
8200 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8201 ArgValue.getValueType(), ArgValue);
8202 }
8203 }
8204 }
8205
8206 InVals.push_back(ArgValue);
8207 }
8208 }
8209 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8210
8211 if (Attrs.hasStreamingCompatibleInterface()) {
8212 SDValue EntryPStateSM =
8213 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8214 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8215
8216 // Copy the value to a virtual register, and save that in FuncInfo.
8217 Register EntryPStateSMReg =
8218 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8219 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8220 EntryPStateSM);
8221 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8222 }
8223
8224 // Insert the SMSTART if this is a locally streaming function and
8225 // make sure it is Glued to the last CopyFromReg value.
8226 if (IsLocallyStreaming) {
8227 if (Attrs.hasStreamingCompatibleInterface())
8228 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8230 else
8231 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8233
8234 // Ensure that the SMSTART happens after the CopyWithChain such that its
8235 // chain result is used.
8236 for (unsigned I=0; I<InVals.size(); ++I) {
8237      Register Reg = MF.getRegInfo().createVirtualRegister(
8238          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8239 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8240 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8241 InVals[I].getValueType());
8242 }
8243 }
8244
8245 // varargs
8246 if (isVarArg) {
8248 if (!Subtarget->isTargetDarwin() || IsWin64) {
8249 // The AAPCS variadic function ABI is identical to the non-variadic
8250 // one. As a result there may be more arguments in registers and we
8251 // should save them for future reference.
8252 // Win64 variadic functions also pass arguments in registers, but all
8253 // float arguments are passed in integer registers.
8254 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8255 }
8256
8257 // This will point to the next argument passed via stack.
8258 unsigned VarArgsOffset = CCInfo.getStackSize();
8259 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8260 VarArgsOffset =
8261 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8262 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8263 FuncInfo->setVarArgsStackIndex(
8264 MFI.CreateFixedObject(4, VarArgsOffset, true));
8265 }
8266
8267 if (MFI.hasMustTailInVarArgFunc()) {
8268 SmallVector<MVT, 2> RegParmTypes;
8269 RegParmTypes.push_back(MVT::i64);
8270 RegParmTypes.push_back(MVT::f128);
8271 // Compute the set of forwarded registers. The rest are scratch.
8272    SmallVectorImpl<ForwardedRegister> &Forwards =
8273 FuncInfo->getForwardedMustTailRegParms();
8274 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8275                                             CC_AArch64_AAPCS);
8276
8277 // Conservatively forward X8, since it might be used for aggregate return.
8278 if (!CCInfo.isAllocated(AArch64::X8)) {
8279 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8280 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8281 }
8282 }
8283 }
8284
8285 // On Windows, InReg pointers must be returned, so record the pointer in a
8286 // virtual register at the start of the function so it can be returned in the
8287 // epilogue.
8288 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8289 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8290 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8291 Ins[I].Flags.isInReg()) &&
8292 Ins[I].Flags.isSRet()) {
8293 assert(!FuncInfo->getSRetReturnReg());
8294
8295 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8296 Register Reg =
8297            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8298 FuncInfo->setSRetReturnReg(Reg);
8299
8300 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8301 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8302 break;
8303 }
8304 }
8305 }
8306
8307 unsigned StackArgSize = CCInfo.getStackSize();
8308 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8309 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8310 // This is a non-standard ABI so by fiat I say we're allowed to make full
8311 // use of the stack area to be popped, which must be aligned to 16 bytes in
8312 // any case:
8313 StackArgSize = alignTo(StackArgSize, 16);
8314
8315 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8316 // a multiple of 16.
8317 FuncInfo->setArgumentStackToRestore(StackArgSize);
8318
8319 // This realignment carries over to the available bytes below. Our own
8320 // callers will guarantee the space is free by giving an aligned value to
8321 // CALLSEQ_START.
8322 }
8323 // Even if we're not expected to free up the space, it's useful to know how
8324 // much is there while considering tail calls (because we can reuse it).
8325 FuncInfo->setBytesInStackArgArea(StackArgSize);
8326
8327 if (Subtarget->hasCustomCallingConv())
8328    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8329
8330 if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
8331 // Old SME ABI lowering (deprecated):
8332 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8333 // will be expanded and stored in the static object later using a
8334 // pseudonode.
8335 if (Attrs.hasZAState()) {
8336 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8337 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8338 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8339 DAG.getConstant(1, DL, MVT::i32));
8340 SDValue Buffer;
8341 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8342 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8343 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8344 } else {
8345 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8346 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8347 DAG.getVTList(MVT::i64, MVT::Other),
8348 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8349 MFI.CreateVariableSizedObject(Align(16), nullptr);
8350 }
8351 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8352 DAG.getConstant(1, DL, MVT::i32));
8353 Chain = DAG.getNode(
8354 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8355 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8356 /*Num save slices*/ NumZaSaveSlices});
8357 } else if (Attrs.hasAgnosticZAInterface()) {
8358 // Call __arm_sme_state_size().
8359 SDValue BufferSize =
8360 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8361 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8362 Chain = BufferSize.getValue(1);
8363 SDValue Buffer;
8364 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8365 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8366 DAG.getVTList(MVT::i64, MVT::Other),
8367 {Chain, BufferSize});
8368 } else {
8369 // Allocate space dynamically.
8370 Buffer = DAG.getNode(
8371 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8372 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8373 MFI.CreateVariableSizedObject(Align(16), nullptr);
8374 }
8375 // Copy the value to a virtual register, and save that in FuncInfo.
8376 Register BufferPtr =
8377 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8378 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8379 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8380 }
8381 }
8382
8383 if (CallConv == CallingConv::PreserveNone) {
8384 for (const ISD::InputArg &I : Ins) {
8385 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8386 I.Flags.isSwiftAsync()) {
8389 MF.getFunction(),
8390 "Swift attributes can't be used with preserve_none",
8391 DL.getDebugLoc()));
8392 break;
8393 }
8394 }
8395 }
8396
8397 if (getTM().useNewSMEABILowering()) {
8398 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8399 if (Attrs.isNewZT0())
8400 Chain = DAG.getNode(
8401 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8402 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8403 DAG.getTargetConstant(0, DL, MVT::i32));
8404 }
8405
8406 return Chain;
8407}
8408
8409void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8410 SelectionDAG &DAG,
8411 const SDLoc &DL,
8412 SDValue &Chain) const {
8413  MachineFunction &MF = DAG.getMachineFunction();
8414 MachineFrameInfo &MFI = MF.getFrameInfo();
8415  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8416 auto PtrVT = getPointerTy(DAG.getDataLayout());
8417 Function &F = MF.getFunction();
8418 bool IsWin64 =
8419 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8420
8421  SmallVector<SDValue, 8> MemOps;
8422
8423  auto GPRArgRegs = AArch64::getGPRArgRegs();
8424 unsigned NumGPRArgRegs = GPRArgRegs.size();
8425 if (Subtarget->isWindowsArm64EC()) {
8426 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8427 // functions.
8428 NumGPRArgRegs = 4;
8429 }
8430 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8431
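  // Reserve an 8-byte spill slot for each GPR argument register that was not
  // used by a named argument, so va_arg can read the remaining ones from memory.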
8432 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8433 int GPRIdx = 0;
8434 if (GPRSaveSize != 0) {
8435 if (IsWin64) {
8436 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8437 if (GPRSaveSize & 15)
8438 // The extra size here, if triggered, will always be 8.
8439 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8440 } else
8441 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8442
8443 SDValue FIN;
8444 if (Subtarget->isWindowsArm64EC()) {
8445 // With the Arm64EC ABI, we reserve the save area as usual, but we
8446 // compute its address relative to x4. For a normal AArch64->AArch64
8447 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8448 // different address.
8449 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8450 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8451 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8452 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8453 } else {
8454 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8455 }
8456
8457 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8458 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8459 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8460 SDValue Store =
8461 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8462                       IsWin64 ? MachinePointerInfo::getFixedStack(
8463 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8464 : MachinePointerInfo::getStack(MF, i * 8));
8465 MemOps.push_back(Store);
8466 FIN =
8467 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8468 }
8469 }
8470 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8471 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8472
8473 if (Subtarget->hasFPARMv8() && !IsWin64) {
8474    auto FPRArgRegs = AArch64::getFPRArgRegs();
8475 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8476 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8477
8478 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8479 int FPRIdx = 0;
8480 if (FPRSaveSize != 0) {
8481 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8482
8483 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8484
8485 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8486 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8487 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8488
8489 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8490 MachinePointerInfo::getStack(MF, i * 16));
8491 MemOps.push_back(Store);
8492 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8493 DAG.getConstant(16, DL, PtrVT));
8494 }
8495 }
8496 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8497 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8498 }
8499
8500 if (!MemOps.empty()) {
8501 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8502 }
8503}
8504
8505/// LowerCallResult - Lower the result values of a call into the
8506/// appropriate copies out of appropriate physical registers.
8507SDValue AArch64TargetLowering::LowerCallResult(
8508 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8509 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8510 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8511 SDValue ThisVal, bool RequiresSMChange) const {
8512 DenseMap<unsigned, SDValue> CopiedRegs;
8513 // Copy all of the result registers out of their specified physreg.
8514 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8515 CCValAssign VA = RVLocs[i];
8516
8517 // Pass 'this' value directly from the argument to return value, to avoid
8518 // reg unit interference
8519 if (i == 0 && isThisReturn) {
8520 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8521 "unexpected return calling convention register assignment");
8522 InVals.push_back(ThisVal);
8523 continue;
8524 }
8525
8526 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8527 // allows one use of a physreg per block.
8528 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8529 if (!Val) {
8530 Val =
8531 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8532 Chain = Val.getValue(1);
8533 InGlue = Val.getValue(2);
8534 CopiedRegs[VA.getLocReg()] = Val;
8535 }
8536
8537 switch (VA.getLocInfo()) {
8538 default:
8539 llvm_unreachable("Unknown loc info!");
8540 case CCValAssign::Full:
8541 break;
8542 case CCValAssign::BCvt:
8543 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8544 break;
8546 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8547 DAG.getConstant(32, DL, VA.getLocVT()));
8548 [[fallthrough]];
8549 case CCValAssign::AExt:
8550 [[fallthrough]];
8551 case CCValAssign::ZExt:
8552 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8553 break;
8554 }
8555
8556 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8557 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8558 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8559
8560 InVals.push_back(Val);
8561 }
8562
8563 return Chain;
8564}
8565
8566/// Return true if the calling convention is one that we can guarantee TCO for.
8567static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8568 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8569         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8570}
8571
8572/// Return true if we might ever do TCO for calls with this calling convention.
8573static bool mayTailCallThisCC(CallingConv::ID CC) {
8574 switch (CC) {
8575 case CallingConv::C:
8576  case CallingConv::AArch64_SVE_VectorCall:
8577  case CallingConv::PreserveMost:
8578  case CallingConv::PreserveAll:
8579  case CallingConv::PreserveNone:
8580 case CallingConv::Swift:
8581  case CallingConv::SwiftTail:
8582 case CallingConv::Tail:
8583 case CallingConv::Fast:
8584 return true;
8585 default:
8586 return false;
8587 }
8588}
8589
8590/// Return true if the call convention supports varargs
8591/// Currently only those that pass varargs like the C
8592/// calling convention does are eligible
8593/// Calling conventions listed in this function must also
8594/// be properly handled in AArch64Subtarget::isCallingConvWin64
8595static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8596 switch (CC) {
8597 case CallingConv::C:
8599 // SVE vector call is only partially supported, but it should
8600 // support named arguments being passed. Any arguments being passed
8601 // as varargs, are still unsupported.
8602  case CallingConv::AArch64_SVE_VectorCall:
8603 return true;
8604 default:
8605 return false;
8606 }
8607}
8608
8609static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8610 const AArch64Subtarget *Subtarget,
8611                                const TargetLowering::CallLoweringInfo &CLI,
8612 CCState &CCInfo) {
8613 const SelectionDAG &DAG = CLI.DAG;
8614 CallingConv::ID CalleeCC = CLI.CallConv;
8615 bool IsVarArg = CLI.IsVarArg;
8616 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8617 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8618
8619 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8620 // for the shadow store.
8621 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8622 CCInfo.AllocateStack(32, Align(16));
8623
8624 unsigned NumArgs = Outs.size();
8625 for (unsigned i = 0; i != NumArgs; ++i) {
8626 MVT ArgVT = Outs[i].VT;
8627 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8628
8629 bool UseVarArgCC = false;
8630 if (IsVarArg) {
8631 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8632 // too, so use the vararg CC to force them to integer registers.
8633 if (IsCalleeWin64) {
8634 UseVarArgCC = true;
8635 } else {
8636 UseVarArgCC = ArgFlags.isVarArg();
8637 }
8638 }
8639
8640 if (!UseVarArgCC) {
8641 // Get type of the original argument.
8642 EVT ActualVT =
8643 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8644 /*AllowUnknown*/ true);
8645 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8646 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8647 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8648 ArgVT = MVT::i8;
8649 else if (ActualMVT == MVT::i16)
8650 ArgVT = MVT::i16;
8651 }
8652
8653 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8654 // argument. This logic should exactly mirror LowerFormalArguments.
8655 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8656 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8657 Outs[i].OrigTy, CCInfo);
8658 assert(!Res && "Call operand has unhandled type");
8659 (void)Res;
8660 }
8661}
8662
8663static SMECallAttrs
8664getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
8665                const TargetLowering::CallLoweringInfo &CLI) {
8666 if (CLI.CB)
8667 return SMECallAttrs(*CLI.CB, &TLI);
8668 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8669 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
8671}
8672
8673bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8674 const CallLoweringInfo &CLI) const {
8675 CallingConv::ID CalleeCC = CLI.CallConv;
8676 if (!mayTailCallThisCC(CalleeCC))
8677 return false;
8678
8679 SDValue Callee = CLI.Callee;
8680 bool IsVarArg = CLI.IsVarArg;
8681 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8682 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8683 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8684 const SelectionDAG &DAG = CLI.DAG;
8685  MachineFunction &MF = DAG.getMachineFunction();
8686 const Function &CallerF = MF.getFunction();
8687 CallingConv::ID CallerCC = CallerF.getCallingConv();
8688
8689 // SME Streaming functions are not eligible for TCO as they may require
8690 // the streaming mode or ZA to be restored after returning from the call.
8691 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
8692 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8693 CallAttrs.requiresPreservingAllZAState() ||
8694 CallAttrs.caller().hasStreamingBody())
8695 return false;
8696
8697 // Functions using the C or Fast calling convention that have an SVE signature
8698 // preserve more registers and should assume the SVE_VectorCall CC.
8699 // The check for matching callee-saved regs will determine whether it is
8700 // eligible for TCO.
8701 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8702      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8703    CallerCC = CallingConv::AArch64_SVE_VectorCall;
8704
8705 bool CCMatch = CallerCC == CalleeCC;
8706
8707 // When using the Windows calling convention on a non-windows OS, we want
8708 // to back up and restore X18 in such functions; we can't do a tail call
8709 // from those functions.
8710 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8711 CalleeCC != CallingConv::Win64)
8712 return false;
8713
8714 // Byval parameters hand the function a pointer directly into the stack area
8715 // we want to reuse during a tail call. Working around this *is* possible (see
8716 // X86) but less efficient and uglier in LowerCall.
8717 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8718 e = CallerF.arg_end();
8719 i != e; ++i) {
8720 if (i->hasByValAttr())
8721 return false;
8722
8723 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8724 // In this case, it is necessary to save X0/X1 in the callee and return it
8725 // in X0. Tail call opt may interfere with this, so we disable tail call
8726 // opt when the caller has an "inreg" attribute -- except if the callee
8727 // also has that attribute on the same argument, and the same value is
8728 // passed.
8729 if (i->hasInRegAttr()) {
8730 unsigned ArgIdx = i - CallerF.arg_begin();
8731 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8732 return false;
8733 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
8734 if (!Attrs.hasAttribute(Attribute::InReg) ||
8735 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
8736 CLI.CB->getArgOperand(ArgIdx) != i) {
8737 return false;
8738 }
8739 }
8740 }
8741
8742 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8743 return CCMatch;
8744
8745 // Externally-defined functions with weak linkage should not be
8746 // tail-called on AArch64 when the OS does not support dynamic
8747 // pre-emption of symbols, as the AAELF spec requires normal calls
8748 // to undefined weak functions to be replaced with a NOP or jump to the
8749 // next instruction. The behaviour of branch instructions in this
8750 // situation (as used for tail calls) is implementation-defined, so we
8751 // cannot rely on the linker replacing the tail call with a return.
8752 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8753 const GlobalValue *GV = G->getGlobal();
8755 if (GV->hasExternalWeakLinkage() &&
8756 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8757 return false;
8758 }
8759
8760 // Now we search for cases where we can use a tail call without changing the
8761 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8762 // concept.
8763
8764 // I want anyone implementing a new calling convention to think long and hard
8765 // about this assert.
8766 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8767 report_fatal_error("Unsupported variadic calling convention");
8768
8769 LLVMContext &C = *DAG.getContext();
8770 // Check that the call results are passed in the same way.
8771 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8772 CCAssignFnForCall(CalleeCC, IsVarArg),
8773 CCAssignFnForCall(CallerCC, IsVarArg)))
8774 return false;
8775 // The callee has to preserve all registers the caller needs to preserve.
8776 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8777 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8778 if (!CCMatch) {
8779 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8780 if (Subtarget->hasCustomCallingConv()) {
8781 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8782 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8783 }
8784 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8785 return false;
8786 }
8787
8788 // Nothing more to check if the callee is taking no arguments
8789 if (Outs.empty())
8790 return true;
8791
8792  SmallVector<CCValAssign, 16> ArgLocs;
8793 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8794
8795 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8796
8797 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8798 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
8799 // At least two cases here: if caller is fastcc then we can't have any
8800 // memory arguments (we'd be expected to clean up the stack afterwards). If
8801 // caller is C then we could potentially use its argument area.
8802
8803 // FIXME: for now we take the most conservative of these in both cases:
8804 // disallow all variadic memory operands.
8805 for (const CCValAssign &ArgLoc : ArgLocs)
8806 if (!ArgLoc.isRegLoc())
8807 return false;
8808 }
8809
8810 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8811
8812 // If any of the arguments is passed indirectly, it must be SVE, so the
8813 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8814 // allocate space on the stack. That is why we explicitly determine here
8815 // that the call cannot be a tailcall.
8816 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8817 assert((A.getLocInfo() != CCValAssign::Indirect ||
8818 A.getValVT().isScalableVector() ||
8819 Subtarget->isWindowsArm64EC()) &&
8820 "Expected value to be scalable");
8821 return A.getLocInfo() == CCValAssign::Indirect;
8822 }))
8823 return false;
8824
8825 // If the stack arguments for this call do not fit into our own save area then
8826 // the call cannot be made tail.
8827 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8828 return false;
8829
8830 const MachineRegisterInfo &MRI = MF.getRegInfo();
8831 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8832 return false;
8833
8834 return true;
8835}
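// For illustration (editor's sketch, not from this file): a caller whose own
// signature contains a byval parameter fails the hasByValAttr() check above,
// so even an IR call marked 'tail' is lowered as an ordinary call:
//
//   declare void @callee()
//   define void @caller(ptr byval(i64) %p) {
//     tail call void @callee()   ; not eligible: the caller has a byval arg
//     ret void
//   }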
8836
8837SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8838 SelectionDAG &DAG,
8839 MachineFrameInfo &MFI,
8840 int ClobberedFI) const {
8841 SmallVector<SDValue, 8> ArgChains;
8842 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8843 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8844
8845 // Include the original chain at the beginning of the list. When this is
8846 // used by target LowerCall hooks, this helps legalize find the
8847 // CALLSEQ_BEGIN node.
8848 ArgChains.push_back(Chain);
8849
8850 // Add a chain value for each stack argument that overlaps the clobbered slot.
8851 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8852 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8853 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8854 if (FI->getIndex() < 0) {
8855 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8856 int64_t InLastByte = InFirstByte;
8857 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8858
8859 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8860 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8861 ArgChains.push_back(SDValue(L, 1));
8862 }
8863
8864 // Build a tokenfactor for all the chains.
8865 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8866}
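// For illustration (editor's sketch): the two comparisons in the loop above
// form a plain closed-interval overlap test between the clobbered slot
// [FirstByte, LastByte] and each incoming load [InFirstByte, InLastByte].
// A standalone equivalent:
//
//   bool slotsOverlap(int64_t AFirst, int64_t ALast,
//                     int64_t BFirst, int64_t BLast) {
//     return (BFirst <= AFirst && AFirst <= BLast) ||
//            (AFirst <= BFirst && BFirst <= ALast);
//   }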
8867
8868bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8869 bool TailCallOpt) const {
8870 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8871 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8872}
8873
8874// Check if the value is zero-extended from i1 to i8
8875static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8876 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8877 if (SizeInBits < 8)
8878 return false;
8879
8880 APInt RequiredZero(SizeInBits, 0xFE);
8881 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8882 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8883 return ZExtBool;
8884}
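// For illustration (editor's sketch): the 0xFE mask asks whether bits 1..7
// are known to be zero, i.e. whether the value already looks like an i1
// zero-extended to i8. For example, an argument produced by
// (zero_extend i1 ... to i32) has Bits.Zero = 0xFFFFFFFE, which covers the
// mask, so no extra TRUNCATE/ZERO_EXTEND pair is needed for the AAPCS rule.
// A minimal standalone form of the test, using only APInt and KnownBits:
//
//   bool looksLikeZExtBool(const llvm::KnownBits &Bits) {
//     llvm::APInt RequiredZero(Bits.getBitWidth(), 0xFE);
//     return (Bits.Zero & RequiredZero) == RequiredZero;
//   }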
8885
8886void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8887 SDNode *Node) const {
8888 // Live-in physreg copies that are glued to SMSTART are applied as
8889 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8890 // register allocator to pass call args in callee saved regs, without extra
8891 // copies to avoid these fake clobbers of actually-preserved GPRs.
8892 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8893 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8894 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8895 if (MachineOperand &MO = MI.getOperand(I);
8896 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8897 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8898 AArch64::GPR64RegClass.contains(MO.getReg())))
8899 MI.removeOperand(I);
8900
8901 // The SVE vector length can change when entering/leaving streaming mode.
8902 // FPMR is set to 0 when entering/leaving streaming mode.
8903 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8904 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8905 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8906 /*IsImplicit=*/true));
8907 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8908 /*IsImplicit=*/true));
8909 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
8910 /*IsImplicit=*/true));
8911 }
8912 }
8913
8914 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8915 // have nothing to do with VG, were it not that they are used to materialise a
8916 // frame-address. If they contain a frame-index to a scalable vector, this
8917 // will likely require an ADDVL instruction to materialise the address, thus
8918 // reading VG.
8919 const MachineFunction &MF = *MI.getMF();
8920 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8921 (MI.getOpcode() == AArch64::ADDXri ||
8922 MI.getOpcode() == AArch64::SUBXri)) {
8923 const MachineOperand &MO = MI.getOperand(1);
8924 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8925 TargetStackID::ScalableVector)
8926 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8927 /*IsImplicit=*/true));
8928 }
8929}
8930
8931SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8932 bool Enable, SDValue Chain,
8933 SDValue InGlue,
8934 unsigned Condition) const {
8935 MachineFunction &MF = DAG.getMachineFunction();
8936 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8937 FuncInfo->setHasStreamingModeChanges(true);
8938
8939 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8940 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8941 SDValue MSROp =
8942 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8943 SmallVector<SDValue> Ops = {Chain, MSROp};
8944 unsigned Opcode;
8945 if (Condition != AArch64SME::Always) {
8946 FuncInfo->setPStateSMRegUsed(true);
8947 Register PStateReg = FuncInfo->getPStateSMReg();
8948 assert(PStateReg.isValid() && "PStateSM Register is invalid");
8949 SDValue PStateSM =
8950 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
8951 // Use chain and glue from the CopyFromReg.
8952 Ops[0] = PStateSM.getValue(1);
8953 InGlue = PStateSM.getValue(2);
8954 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8955 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
8956 Ops.push_back(ConditionOp);
8957 Ops.push_back(PStateSM);
8958 } else {
8959 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8960 }
8961 Ops.push_back(RegMask);
8962
8963 if (InGlue)
8964 Ops.push_back(InGlue);
8965
8966 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8967}
8968
8969// Emit a call to __arm_sme_save or __arm_sme_restore.
8970static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8971 SelectionDAG &DAG,
8972 AArch64FunctionInfo *Info, SDLoc DL,
8973 SDValue Chain, bool IsSave) {
8974 MachineFunction &MF = DAG.getMachineFunction();
8975 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8976 FuncInfo->setSMESaveBufferUsed();
8977 TargetLowering::ArgListTy Args;
8978 Args.emplace_back(
8979 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8980 PointerType::getUnqual(*DAG.getContext()));
8981
8982 RTLIB::Libcall LC =
8983 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8984 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8985 TLI.getPointerTy(DAG.getDataLayout()));
8986 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8987 TargetLowering::CallLoweringInfo CLI(DAG);
8988 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8989 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8990 return TLI.LowerCallTo(CLI).second;
8991}
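// Editor's note (illustrative): the libcall built above behaves like the C
// calls
//   __arm_sme_save(Buffer);      // IsSave == true
//   __arm_sme_restore(Buffer);   // IsSave == false
// where Buffer is the address held in the function's SME save-buffer slot.
// The void(void *) prototype is inferred from the single pointer argument
// passed here, not stated in this file.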
8992
8993static AArch64SME::ToggleCondition
8994getSMToggleCondition(const SMECallAttrs &CallAttrs) {
8995 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
8996 CallAttrs.caller().hasStreamingBody())
8997 return AArch64SME::Always;
8998 if (CallAttrs.callee().hasNonStreamingInterface())
8999 return AArch64SME::IfCallerIsStreaming;
9000 if (CallAttrs.callee().hasStreamingInterface())
9001 return AArch64SME::IfCallerIsNotStreaming;
9002
9003 llvm_unreachable("Unsupported attributes");
9004}
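// For illustration (editor's sketch using the ACLE keyword attributes; not
// taken from this file):
//
//   void callee_plain(void);
//   void callee_streaming(void) __arm_streaming;
//   void caller(void) __arm_streaming_compatible {
//     callee_plain();      // toggle only if the caller entered in streaming mode
//     callee_streaming();  // toggle only if the caller entered in non-streaming mode
//   }
//
// A caller with a fixed interface, or one with a locally streaming body,
// already knows its mode at the call site, so the condition is Always.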
9005
9006/// Check whether a stack argument requires lowering in a tail call.
9007static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9008 const CCValAssign &VA, SDValue Arg,
9009 ISD::ArgFlagsTy Flags, int CallOffset) {
9010 // FIXME: We should be able to handle this case, but it's not clear how to.
9011 if (Flags.isZExt() || Flags.isSExt())
9012 return true;
9013
9014 for (;;) {
9015 // Look through nodes that don't alter the bits of the incoming value.
9016 unsigned Op = Arg.getOpcode();
9017 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9018 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9019 Arg = Arg.getOperand(0);
9020 continue;
9021 }
9022 break;
9023 }
9024
9025 // If the argument is a load from the same immutable stack slot, we can reuse
9026 // it.
9027 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9028 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9029 const MachineFrameInfo &MFI = MF.getFrameInfo();
9030 int FI = FINode->getIndex();
9031 if (!MFI.isImmutableObjectIndex(FI))
9032 return true;
9033 if (CallOffset != MFI.getObjectOffset(FI))
9034 return true;
9035 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9036 if (SizeInBits / 8 != MFI.getObjectSize(FI))
9037 return true;
9038 return false;
9039 }
9040 }
9041
9042 return true;
9043}
9044
9045/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9046/// and add input and output parameter nodes.
9047SDValue
9048AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9049 SmallVectorImpl<SDValue> &InVals) const {
9050 SelectionDAG &DAG = CLI.DAG;
9051 SDLoc &DL = CLI.DL;
9052 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9053 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9054 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9055 SDValue Chain = CLI.Chain;
9056 SDValue Callee = CLI.Callee;
9057 bool &IsTailCall = CLI.IsTailCall;
9058 CallingConv::ID &CallConv = CLI.CallConv;
9059 bool IsVarArg = CLI.IsVarArg;
9060 const CallBase *CB = CLI.CB;
9061
9062 MachineFunction &MF = DAG.getMachineFunction();
9063 MachineFunction::CallSiteInfo CSInfo;
9064 bool IsThisReturn = false;
9065
9066 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9067 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9068 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9069 bool IsSibCall = false;
9070 bool GuardWithBTI = false;
9071
9072 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9073 !Subtarget->noBTIAtReturnTwice()) {
9074 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9075 }
9076
9077 // Analyze operands of the call, assigning locations to each operand.
9078 SmallVector<CCValAssign, 16> ArgLocs;
9079 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9080
9081 if (IsVarArg) {
9082 unsigned NumArgs = Outs.size();
9083
9084 for (unsigned i = 0; i != NumArgs; ++i) {
9085 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9086 report_fatal_error("Passing SVE types to variadic functions is "
9087 "currently not supported");
9088 }
9089 }
9090
9091 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9092
9093 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9094 // Assign locations to each value returned by this call.
9095 SmallVector<CCValAssign, 16> RVLocs;
9096 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9097 *DAG.getContext());
9098 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9099
9100 // Set type id for call site info.
9101 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9102 CSInfo = MachineFunction::CallSiteInfo(*CB);
9103
9104 // Check callee args/returns for SVE registers and set calling convention
9105 // accordingly.
9106 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9107 auto HasSVERegLoc = [](CCValAssign &Loc) {
9108 if (!Loc.isRegLoc())
9109 return false;
9110 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9111 AArch64::PPRRegClass.contains(Loc.getLocReg());
9112 };
9113 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9114 CallConv = CallingConv::AArch64_SVE_VectorCall;
9115 }
9116
9117 // Determine whether we need any streaming mode changes.
9118 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9119 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9120 bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
9121 auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
9122 // TODO: Handle agnostic ZA functions.
9123 if (!UseNewSMEABILowering || IsAgnosticZAFunction)
9124 return std::nullopt;
9125 if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
9126 return std::nullopt;
9127 return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
9128 : AArch64ISD::INOUT_ZA_USE;
9129 }();
9130
9131 if (IsTailCall) {
9132 // Check if it's really possible to do a tail call.
9133 IsTailCall = isEligibleForTailCallOptimization(CLI);
9134
9135 // A sibling call is one where we're under the usual C ABI and not planning
9136 // to change that but can still do a tail call:
9137 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9138 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9139 IsSibCall = true;
9140
9141 if (IsTailCall)
9142 ++NumTailCalls;
9143 }
9144
9145 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9146 report_fatal_error("failed to perform tail call elimination on a call "
9147 "site marked musttail");
9148
9149 // Get a count of how many bytes are to be pushed on the stack.
9150 unsigned NumBytes = CCInfo.getStackSize();
9151
9152 if (IsSibCall) {
9153 // Since we're not changing the ABI to make this a tail call, the memory
9154 // operands are already available in the caller's incoming argument space.
9155 NumBytes = 0;
9156 }
9157
9158 // FPDiff is the byte offset of the call's argument area from the callee's.
9159 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9160 // by this amount for a tail call. In a sibling call it must be 0 because the
9161 // caller will deallocate the entire stack and the callee still expects its
9162 // arguments to begin at SP+0. Completely unused for non-tail calls.
9163 int FPDiff = 0;
9164
9165 if (IsTailCall && !IsSibCall) {
9166 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9167
9168 // Since callee will pop argument stack as a tail call, we must keep the
9169 // popped size 16-byte aligned.
9170 NumBytes = alignTo(NumBytes, 16);
9171
9172 // FPDiff will be negative if this tail call requires more space than we
9173 // would automatically have in our incoming argument space. Positive if we
9174 // can actually shrink the stack.
9175 FPDiff = NumReusableBytes - NumBytes;
9176
9177 // Update the required reserved area if this is the tail call requiring the
9178 // most argument stack space.
9179 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9180 FuncInfo->setTailCallReservedStack(-FPDiff);
9181
9182 // The stack pointer must be 16-byte aligned at all times it's used for a
9183 // memory operation, which in practice means at *all* times and in
9184 // particular across call boundaries. Therefore our own arguments started at
9185 // a 16-byte aligned SP and the delta applied for the tail call should
9186 // satisfy the same constraint.
9187 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9188 }
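 // Worked example (editor's note, not from the source): with
 // NumReusableBytes = 48 and NumBytes = 64 after alignment, FPDiff is
 // 48 - 64 = -16, so callee stack slots are addressed 16 bytes below the
 // caller's incoming argument area and TailCallReservedStack grows to at
 // least 16; with NumBytes = 32 the delta is +16 and nothing extra is
 // reserved.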
9189
9190 auto DescribeCallsite =
9191 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9192 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9193 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9194 R << ore::NV("Callee", ES->getSymbol());
9195 else if (CLI.CB && CLI.CB->getCalledFunction())
9196 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9197 else
9198 R << "unknown callee";
9199 R << "'";
9200 return R;
9201 };
9202
9203 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9204 bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
9205 if (RequiresLazySave) {
9206 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9207 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9208 TPIDR2.FrameIndex,
9209 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9210 Chain = DAG.getNode(
9211 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9212 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9213 TPIDR2ObjAddr);
9214 OptimizationRemarkEmitter ORE(&MF.getFunction());
9215 ORE.emit([&]() {
9216 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9217 CLI.CB)
9218 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9219 &MF.getFunction());
9220 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9221 });
9222 } else if (RequiresSaveAllZA) {
9223 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9224 "Cannot share state that may not exist");
9225 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9226 /*IsSave=*/true);
9227 }
9228
9229 bool RequiresSMChange = CallAttrs.requiresSMChange();
9230 if (RequiresSMChange) {
9231 OptimizationRemarkEmitter ORE(&MF.getFunction());
9232 ORE.emit([&]() {
9233 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9234 CLI.CB)
9235 : OptimizationRemarkAnalysis("sme", "SMETransition",
9236 &MF.getFunction());
9237 DescribeCallsite(R) << " requires a streaming mode transition";
9238 return R;
9239 });
9240 }
9241
9242 SDValue ZTFrameIdx;
9243 MachineFrameInfo &MFI = MF.getFrameInfo();
9244 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9245
9246 // If the caller has ZT0 state which will not be preserved by the callee,
9247 // spill ZT0 before the call.
9248 if (ShouldPreserveZT0) {
9249 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9250 ZTFrameIdx = DAG.getFrameIndex(
9251 ZTObj,
9252 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9253
9254 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9255 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9256 }
9257
9258 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9259 // PSTATE.ZA before the call if there is no lazy-save active.
9260 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9261 assert((!DisableZA || !RequiresLazySave) &&
9262 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9263
9264 if (DisableZA)
9265 Chain = DAG.getNode(
9266 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9267 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9268
9269 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9270 // These operations are automatically eliminated by the prolog/epilog pass
9271 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9272 if (!IsSibCall) {
9273 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9274 if (ZAMarkerNode) {
9275 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9276 // using a chain can result in incorrect scheduling. The markers refer to
9277 // the position just before the CALLSEQ_START (though occur after as
9278 // CALLSEQ_START lacks in-glue).
9279 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9280 {Chain, Chain.getValue(1)});
9281 }
9282 }
9283
9284 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9285 getPointerTy(DAG.getDataLayout()));
9286
9287 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9288 SmallSet<unsigned, 8> RegsUsed;
9289 SmallVector<SDValue, 8> MemOpChains;
9290 auto PtrVT = getPointerTy(DAG.getDataLayout());
9291
9292 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9293 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9294 for (const auto &F : Forwards) {
9295 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9296 RegsToPass.emplace_back(F.PReg, Val);
9297 }
9298 }
9299
9300 // Walk the register/memloc assignments, inserting copies/loads.
9301 unsigned ExtraArgLocs = 0;
9302 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9303 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9304 SDValue Arg = OutVals[i];
9305 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9306
9307 // Promote the value if needed.
9308 switch (VA.getLocInfo()) {
9309 default:
9310 llvm_unreachable("Unknown loc info!");
9311 case CCValAssign::Full:
9312 break;
9313 case CCValAssign::SExt:
9314 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9315 break;
9316 case CCValAssign::ZExt:
9317 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9318 break;
9319 case CCValAssign::AExt:
9320 if (Outs[i].ArgVT == MVT::i1) {
9321 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9322 //
9323 // Check if we actually have to do this, because the value may
9324 // already be zero-extended.
9325 //
9326 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9327 // and rely on DAGCombiner to fold this, because the following
9328 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9329 //
9330 // (ext (zext x)) -> (zext x)
9331 //
9332 // This will give us (zext i32), which we cannot remove, so
9333 // try to check this beforehand.
9334 if (!checkZExtBool(Arg, DAG)) {
9335 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9336 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9337 }
9338 }
9339 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9340 break;
9341 case CCValAssign::AExtUpper:
9342 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9343 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9344 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9345 DAG.getConstant(32, DL, VA.getLocVT()));
9346 break;
9347 case CCValAssign::BCvt:
9348 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9349 break;
9350 case CCValAssign::Trunc:
9351 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9352 break;
9353 case CCValAssign::FPExt:
9354 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9355 break;
9356 case CCValAssign::Indirect: {
9357 bool isScalable = VA.getValVT().isScalableVT();
9358 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9359 "Indirect arguments should be scalable on most subtargets");
9360
9361 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9362 uint64_t PartSize = StoreSize;
9363 unsigned NumParts = 1;
9364 if (Outs[i].Flags.isInConsecutiveRegs()) {
9365 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9366 ++NumParts;
9367 StoreSize *= NumParts;
9368 }
9369
9370 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9371 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9372 MachineFrameInfo &MFI = MF.getFrameInfo();
9373 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9374 if (isScalable)
9375 MFI.setStackID(FI, TargetStackID::ScalableVector);
9376
9377 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9378 SDValue Ptr = DAG.getFrameIndex(
9379 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9380 SDValue SpillSlot = Ptr;
9381
9382 // Ensure we generate all stores for each tuple part, whilst updating the
9383 // pointer after each store correctly using vscale.
9384 while (NumParts) {
9385 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9386 MemOpChains.push_back(Store);
9387
9388 NumParts--;
9389 if (NumParts > 0) {
9390 SDValue BytesIncrement;
9391 if (isScalable) {
9392 BytesIncrement = DAG.getVScale(
9393 DL, Ptr.getValueType(),
9394 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9395 } else {
9396 BytesIncrement = DAG.getConstant(
9397 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9398 Ptr.getValueType());
9399 }
9400 MPI = MachinePointerInfo(MPI.getAddrSpace());
9401 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9402 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9403 ExtraArgLocs++;
9404 i++;
9405 }
9406 }
9407
9408 Arg = SpillSlot;
9409 break;
9410 }
9411
9412 if (VA.isRegLoc()) {
9413 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9414 Outs[0].VT == MVT::i64) {
9415 assert(VA.getLocVT() == MVT::i64 &&
9416 "unexpected calling convention register assignment");
9417 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9418 "unexpected use of 'returned'");
9419 IsThisReturn = true;
9420 }
9421 if (RegsUsed.count(VA.getLocReg())) {
9422 // If this register has already been used then we're trying to pack
9423 // parts of an [N x i32] into an X-register. The extension type will
9424 // take care of putting the two halves in the right place but we have to
9425 // combine them.
9426 SDValue &Bits =
9427 llvm::find_if(RegsToPass,
9428 [=](const std::pair<unsigned, SDValue> &Elt) {
9429 return Elt.first == VA.getLocReg();
9430 })
9431 ->second;
9432 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9433 // Call site info is used for function's parameter entry value
9434 // tracking. For now we track only simple cases when parameter
9435 // is transferred through whole register.
9436 llvm::erase_if(CSInfo.ArgRegPairs,
9437 [&VA](MachineFunction::ArgRegPair ArgReg) {
9438 return ArgReg.Reg == VA.getLocReg();
9439 });
9440 } else {
9441 // Add an extra level of indirection for streaming mode changes by
9442 // using a pseudo copy node that cannot be rematerialised between a
9443 // smstart/smstop and the call by the simple register coalescer.
9444 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9445 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9446 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9447 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9448 RegsUsed.insert(VA.getLocReg());
9449 const TargetOptions &Options = DAG.getTarget().Options;
9450 if (Options.EmitCallSiteInfo)
9451 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9452 }
9453 } else {
9454 assert(VA.isMemLoc());
9455
9456 SDValue DstAddr;
9457 MachinePointerInfo DstInfo;
9458
9459 // FIXME: This works on big-endian for composite byvals, which are the
9460 // common case. It should work for fundamental types too.
9461 uint32_t BEAlign = 0;
9462 unsigned OpSize;
9463 if (VA.getLocInfo() == CCValAssign::Indirect ||
9464 VA.getLocInfo() == CCValAssign::Trunc)
9465 OpSize = VA.getLocVT().getFixedSizeInBits();
9466 else
9467 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9468 : VA.getValVT().getSizeInBits();
9469 OpSize = (OpSize + 7) / 8;
9470 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9471 !Flags.isInConsecutiveRegs()) {
9472 if (OpSize < 8)
9473 BEAlign = 8 - OpSize;
9474 }
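 // Worked example (editor's note): on a big-endian target an i16 argument
 // with OpSize = 2 gets BEAlign = 8 - 2 = 6, so the destination address
 // below becomes LocMemOffset + 6, placing the two bytes at the
 // least-significant end of the 8-byte big-endian stack slot.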
9475 unsigned LocMemOffset = VA.getLocMemOffset();
9476 int32_t Offset = LocMemOffset + BEAlign;
9477
9478 if (IsTailCall) {
9479 // When the frame pointer is perfectly aligned for the tail call and the
9480 // same stack argument is passed down intact, we can reuse it.
9481 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9482 continue;
9483
9484 Offset = Offset + FPDiff;
9485 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9486
9487 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9488 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9489
9490 // Make sure any stack arguments overlapping with where we're storing
9491 // are loaded before this eventual operation. Otherwise they'll be
9492 // clobbered.
9493 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9494 } else {
9495 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9496
9497 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9498 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9499 }
9500
9501 if (Outs[i].Flags.isByVal()) {
9502 SDValue SizeNode =
9503 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9504 SDValue Cpy = DAG.getMemcpy(
9505 Chain, DL, DstAddr, Arg, SizeNode,
9506 Outs[i].Flags.getNonZeroByValAlign(),
9507 /*isVol = */ false, /*AlwaysInline = */ false,
9508 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9509
9510 MemOpChains.push_back(Cpy);
9511 } else {
9512 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9513 // promoted to a legal register type i32, we should truncate Arg back to
9514 // i1/i8/i16.
9515 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9516 VA.getValVT() == MVT::i16)
9517 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9518
9519 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9520 MemOpChains.push_back(Store);
9521 }
9522 }
9523 }
9524
9525 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9526 !(CLI.CB && CLI.CB->isMustTailCall())) {
9527 SDValue ParamPtr = StackPtr;
9528 if (IsTailCall) {
9529 // Create a dummy object at the top of the stack that can be used to get
9530 // the SP after the epilogue
9531 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9532 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9533 }
9534
9535 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9536 // describing the argument list. x4 contains the address of the
9537 // first stack parameter. x5 contains the size in bytes of all parameters
9538 // passed on the stack.
9539 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9540 RegsToPass.emplace_back(AArch64::X5,
9541 DAG.getConstant(NumBytes, DL, MVT::i64));
9542 }
9543
9544 if (!MemOpChains.empty())
9545 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9546
9547 SDValue InGlue;
9548 if (RequiresSMChange) {
9549 Chain =
9550 changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
9551 Chain, InGlue, getSMToggleCondition(CallAttrs));
9552 InGlue = Chain.getValue(1);
9553 }
9554
9555 // Build a sequence of copy-to-reg nodes chained together with token chain
9556 // and flag operands which copy the outgoing args into the appropriate regs.
9557 for (auto &RegToPass : RegsToPass) {
9558 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9559 RegToPass.second, InGlue);
9560 InGlue = Chain.getValue(1);
9561 }
9562
9563 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9564 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9565 // node so that legalize doesn't hack it.
9566 const GlobalValue *CalledGlobal = nullptr;
9567 unsigned OpFlags = 0;
9568 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9569 CalledGlobal = G->getGlobal();
9570 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9571 getTargetMachine());
9572 if (OpFlags & AArch64II::MO_GOT) {
9573 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9574 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9575 } else {
9576 const GlobalValue *GV = G->getGlobal();
9577 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9578 }
9579 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9580 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9581 Subtarget->isTargetMachO()) ||
9582 MF.getFunction().getParent()->getRtLibUseGOT();
9583 const char *Sym = S->getSymbol();
9584 if (UseGot) {
9585 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9586 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9587 } else {
9588 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9589 }
9590 }
9591
9592 // We don't usually want to end the call-sequence here because we would tidy
9593 // the frame up *after* the call; however, in the ABI-changing tail-call case
9594 // we've carefully laid out the parameters so that when sp is reset they'll be
9595 // in the correct location.
9596 if (IsTailCall && !IsSibCall) {
9597 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9598 InGlue = Chain.getValue(1);
9599 }
9600
9601 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9602
9603 std::vector<SDValue> Ops;
9604 Ops.push_back(Chain);
9605 Ops.push_back(Callee);
9606
9607 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9608 // be expanded to the call, directly followed by a special marker sequence and
9609 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9610 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9611 assert(!IsTailCall &&
9612 "tail calls cannot be marked with clang.arc.attachedcall");
9613 Opc = AArch64ISD::CALL_RVMARKER;
9614
9615 // Add a target global address for the retainRV/claimRV runtime function
9616 // just before the call target.
9617 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9618 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9619 Ops.insert(Ops.begin() + 1, GA);
9620
9621 // We may or may not need to emit both the marker and the retain/claim call.
9622 // Tell the pseudo expansion using an additional boolean op.
9623 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9624 SDValue DoEmitMarker =
9625 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9626 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9627 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9628 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9629 } else if (GuardWithBTI) {
9630 Opc = AArch64ISD::CALL_BTI;
9631 }
9632
9633 if (IsTailCall) {
9634 // Each tail call may have to adjust the stack by a different amount, so
9635 // this information must travel along with the operation for eventual
9636 // consumption by emitEpilogue.
9637 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9638 }
9639
9640 if (CLI.PAI) {
9641 const uint64_t Key = CLI.PAI->Key;
9642 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9643 "Invalid auth call key");
9644
9645 // Split the discriminator into address/integer components.
9646 SDValue AddrDisc, IntDisc;
9647 std::tie(IntDisc, AddrDisc) =
9648 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9649
9650 if (Opc == AArch64ISD::CALL_RVMARKER)
9651 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9652 else
9653 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9654 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9655 Ops.push_back(IntDisc);
9656 Ops.push_back(AddrDisc);
9657 }
9658
9659 // Add argument registers to the end of the list so that they are known live
9660 // into the call.
9661 for (auto &RegToPass : RegsToPass)
9662 Ops.push_back(DAG.getRegister(RegToPass.first,
9663 RegToPass.second.getValueType()));
9664
9665 // Add a register mask operand representing the call-preserved registers.
9666 const uint32_t *Mask;
9667 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9668 if (IsThisReturn) {
9669 // For 'this' returns, use the X0-preserving mask if applicable
9670 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9671 if (!Mask) {
9672 IsThisReturn = false;
9673 Mask = TRI->getCallPreservedMask(MF, CallConv);
9674 }
9675 } else
9676 Mask = TRI->getCallPreservedMask(MF, CallConv);
9677
9678 if (Subtarget->hasCustomCallingConv())
9679 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9680
9681 if (TRI->isAnyArgRegReserved(MF))
9682 TRI->emitReservedArgRegCallError(MF);
9683
9684 assert(Mask && "Missing call preserved mask for calling convention");
9685 Ops.push_back(DAG.getRegisterMask(Mask));
9686
9687 if (InGlue.getNode())
9688 Ops.push_back(InGlue);
9689
9690 // If we're doing a tail call, use a TC_RETURN here rather than an
9691 // actual call instruction.
9692 if (IsTailCall) {
9693 MF.getFrameInfo().setHasTailCall();
9694 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9695 if (IsCFICall)
9696 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9697
9698 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9699 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9700 if (CalledGlobal &&
9701 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9702 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9703 return Ret;
9704 }
9705
9706 // Returns a chain and a flag for retval copy to use.
9707 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9708 if (IsCFICall)
9709 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9710
9711 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9712 InGlue = Chain.getValue(1);
9713 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9714 if (CalledGlobal &&
9715 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9716 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9717
9718 uint64_t CalleePopBytes =
9719 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9720
9721 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9722 InGlue = Chain.getValue(1);
9723
9724 // Handle result values, copying them out of physregs into vregs that we
9725 // return.
9726 SDValue Result = LowerCallResult(
9727 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9728 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9729
9730 if (!Ins.empty())
9731 InGlue = Result.getValue(Result->getNumValues() - 1);
9732
9733 if (RequiresSMChange) {
9734 Result = changeStreamingMode(
9735 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
9736 getSMToggleCondition(CallAttrs));
9737 }
9738
9739 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
9740 // Unconditionally resume ZA.
9741 Result = DAG.getNode(
9742 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
9743 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9744
9745 if (ShouldPreserveZT0)
9746 Result =
9747 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9748 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9749
9750 if (RequiresLazySave) {
9751 // Conditionally restore the lazy save using a pseudo node.
9752 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
9753 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9754 SDValue RegMask = DAG.getRegisterMask(
9755 TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
9756 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9757 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
9758 SDValue TPIDR2_EL0 = DAG.getNode(
9759 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9760 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9761 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9762 // RESTORE_ZA pseudo.
9763 SDValue Glue;
9764 SDValue TPIDR2Block = DAG.getFrameIndex(
9765 TPIDR2.FrameIndex,
9766 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9767 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9768 Result =
9769 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9770 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9771 RestoreRoutine, RegMask, Result.getValue(1)});
9772 // Finally reset the TPIDR2_EL0 register to 0.
9773 Result = DAG.getNode(
9774 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9775 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9776 DAG.getConstant(0, DL, MVT::i64));
9777 TPIDR2.Uses++;
9778 } else if (RequiresSaveAllZA) {
9779 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9780 /*IsSave=*/false);
9781 }
9782
9783 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9784 RequiresSaveAllZA) {
9785 for (unsigned I = 0; I < InVals.size(); ++I) {
9786 // The smstart/smstop is chained as part of the call, but when the
9787 // resulting chain is discarded (which happens when the call is not part
9788 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9789 // smstart/smstop is chained to the result value. We can do that by doing
9790 // a vreg -> vreg copy.
9791 Register Reg = MF.getRegInfo().createVirtualRegister(
9792 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9793 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9794 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9795 InVals[I].getValueType());
9796 }
9797 }
9798
9799 if (CallConv == CallingConv::PreserveNone) {
9800 for (const ISD::OutputArg &O : Outs) {
9801 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9802 O.Flags.isSwiftAsync()) {
9803 MachineFunction &MF = DAG.getMachineFunction();
9804 MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported(
9805 MF.getFunction(),
9806 "Swift attributes can't be used with preserve_none",
9807 DL.getDebugLoc()));
9808 break;
9809 }
9810 }
9811 }
9812
9813 return Result;
9814}
9815
9816bool AArch64TargetLowering::CanLowerReturn(
9817 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9818 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9819 const Type *RetTy) const {
9820 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9821 SmallVector<CCValAssign, 16> RVLocs;
9822 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9823 return CCInfo.CheckReturn(Outs, RetCC);
9824}
9825
9826SDValue
9827AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9828 bool isVarArg,
9829 const SmallVectorImpl<ISD::OutputArg> &Outs,
9830 const SmallVectorImpl<SDValue> &OutVals,
9831 const SDLoc &DL, SelectionDAG &DAG) const {
9832 auto &MF = DAG.getMachineFunction();
9833 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9834
9835 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9836 SmallVector<CCValAssign, 16> RVLocs;
9837 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9838 CCInfo.AnalyzeReturn(Outs, RetCC);
9839
9840 // Copy the result values into the output registers.
9841 SDValue Glue;
9842 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9843 SmallSet<unsigned, 4> RegsUsed;
9844 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9845 ++i, ++realRVLocIdx) {
9846 CCValAssign &VA = RVLocs[i];
9847 assert(VA.isRegLoc() && "Can only return in registers!");
9848 SDValue Arg = OutVals[realRVLocIdx];
9849
9850 switch (VA.getLocInfo()) {
9851 default:
9852 llvm_unreachable("Unknown loc info!");
9853 case CCValAssign::Full:
9854 if (Outs[i].ArgVT == MVT::i1) {
9855 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9856 // value. This is strictly redundant on Darwin (which uses "zeroext
9857 // i1"), but will be optimised out before ISel.
9858 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9859 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9860 }
9861 break;
9862 case CCValAssign::BCvt:
9863 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9864 break;
9865 case CCValAssign::AExt:
9866 case CCValAssign::ZExt:
9867 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9868 break;
9869 case CCValAssign::AExtUpper:
9870 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9871 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9872 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9873 DAG.getConstant(32, DL, VA.getLocVT()));
9874 break;
9875 }
9876
9877 if (RegsUsed.count(VA.getLocReg())) {
9878 SDValue &Bits =
9879 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9880 return Elt.first == VA.getLocReg();
9881 })->second;
9882 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9883 } else {
9884 RetVals.emplace_back(VA.getLocReg(), Arg);
9885 RegsUsed.insert(VA.getLocReg());
9886 }
9887 }
9888
9889 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9890
9891 // Emit SMSTOP before returning from a locally streaming function
9892 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
9893 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9894 if (FuncAttrs.hasStreamingCompatibleInterface())
9895 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9896 /*Glue*/ SDValue(),
9897 AArch64SME::IfCallerIsNotStreaming);
9898 else
9899 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9900 /*Glue*/ SDValue(), AArch64SME::Always);
9901 Glue = Chain.getValue(1);
9902 }
9903
9904 SmallVector<SDValue, 4> RetOps(1, Chain);
9905 for (auto &RetVal : RetVals) {
9906 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9907 isPassedInFPR(RetVal.second.getValueType()))
9908 RetVal.second =
9909 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9910 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
9911 RetVal.second);
9912 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9913 Glue = Chain.getValue(1);
9914 RetOps.push_back(
9915 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9916 }
9917
9918 // Windows AArch64 ABIs require that for returning structs by value we copy
9919 // the sret argument into X0 for the return.
9920 // We saved the argument into a virtual register in the entry block,
9921 // so now we copy the value out and into X0.
9922 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9923 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9924 getPointerTy(MF.getDataLayout()));
9925
9926 unsigned RetValReg = AArch64::X0;
9927 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9928 RetValReg = AArch64::X8;
9929 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9930 Glue = Chain.getValue(1);
9931
9932 RetOps.push_back(
9933 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9934 }
9935
9936 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9937 if (I) {
9938 for (; *I; ++I) {
9939 if (AArch64::GPR64RegClass.contains(*I))
9940 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9941 else if (AArch64::FPR64RegClass.contains(*I))
9942 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9943 else
9944 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9945 }
9946 }
9947
9948 RetOps[0] = Chain; // Update chain.
9949
9950 // Add the glue if we have it.
9951 if (Glue.getNode())
9952 RetOps.push_back(Glue);
9953
9954 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9955 // ARM64EC entry thunks use a special return sequence: instead of a regular
9956 // "ret" instruction, they need to explicitly call the emulator.
9957 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9958 SDValue Arm64ECRetDest =
9959 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9960 Arm64ECRetDest =
9961 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9962 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9963 MachinePointerInfo());
9964 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9965 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9966 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9967 }
9968
9969 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9970}
9971
9972//===----------------------------------------------------------------------===//
9973// Other Lowering Code
9974//===----------------------------------------------------------------------===//
9975
9976SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9977 SelectionDAG &DAG,
9978 unsigned Flag) const {
9979 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9980 N->getOffset(), Flag);
9981}
9982
9983SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9984 SelectionDAG &DAG,
9985 unsigned Flag) const {
9986 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9987}
9988
9989SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9990 SelectionDAG &DAG,
9991 unsigned Flag) const {
9992 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9993 N->getOffset(), Flag);
9994}
9995
9996SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9997 SelectionDAG &DAG,
9998 unsigned Flag) const {
9999 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10000}
10001
10002SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10003 SelectionDAG &DAG,
10004 unsigned Flag) const {
10005 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10006}
10007
10008// (loadGOT sym)
10009template <class NodeTy>
10010SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10011 unsigned Flags) const {
10012 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10013 SDLoc DL(N);
10014 EVT Ty = getPointerTy(DAG.getDataLayout());
10015 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10016 // FIXME: Once remat is capable of dealing with instructions with register
10017 // operands, expand this into two nodes instead of using a wrapper node.
10018 if (DAG.getMachineFunction()
10019 .getInfo<AArch64FunctionInfo>()
10020 ->hasELFSignedGOT())
10021 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10022 0);
10023 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10024}
10025
10026// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10027template <class NodeTy>
10028SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10029 unsigned Flags) const {
10030 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10031 SDLoc DL(N);
10032 EVT Ty = getPointerTy(DAG.getDataLayout());
10033 const unsigned char MO_NC = AArch64II::MO_NC;
10034 return DAG.getNode(
10035 AArch64ISD::WrapperLarge, DL, Ty,
10036 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10037 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10038 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10039 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10040}
10041
10042// (addlow (adrp %hi(sym)) %lo(sym))
10043template <class NodeTy>
10044SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10045 unsigned Flags) const {
10046 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10047 SDLoc DL(N);
10048 EVT Ty = getPointerTy(DAG.getDataLayout());
10049 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10050 SDValue Lo = getTargetNode(N, Ty, DAG,
10051 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10052 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10053 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10054}
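// For illustration (editor's note): in the small code model this lowers a
// reference to a global 'var' to the usual two-instruction sequence
//   adrp x0, var
//   add x0, x0, :lo12:var
// with the page and page-offset relocations resolved by the linker.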
10055
10056// (adr sym)
10057template <class NodeTy>
10058SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10059 unsigned Flags) const {
10060 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10061 SDLoc DL(N);
10062 EVT Ty = getPointerTy(DAG.getDataLayout());
10063 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10064 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10065}
10066
10067SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10068 SelectionDAG &DAG) const {
10069 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10070 const GlobalValue *GV = GN->getGlobal();
10071 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10072
10073 if (OpFlags != AArch64II::MO_NO_FLAG)
10074 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
10075 "unexpected offset in global node");
10076
10077 // This also catches the large code model case for Darwin, and tiny code
10078 // model with got relocations.
10079 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10080 return getGOT(GN, DAG, OpFlags);
10081 }
10082
10083 SDValue Result;
10084 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10085 !getTargetMachine().isPositionIndependent()) {
10086 Result = getAddrLarge(GN, DAG, OpFlags);
10087 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10088 Result = getAddrTiny(GN, DAG, OpFlags);
10089 } else {
10090 Result = getAddr(GN, DAG, OpFlags);
10091 }
10092 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10093 SDLoc DL(GN);
10094 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10095 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10096 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10097 return Result;
10098}
10099
10100/// Convert a TLS address reference into the correct sequence of loads
10101/// and calls to compute the variable's address (for Darwin, currently) and
10102/// return an SDValue containing the final node.
10103
10104/// Darwin only has one TLS scheme which must be capable of dealing with the
10105/// fully general situation, in the worst case. This means:
10106/// + "extern __thread" declaration.
10107/// + Defined in a possibly unknown dynamic library.
10108///
10109/// The general system is that each __thread variable has a [3 x i64] descriptor
10110/// which contains information used by the runtime to calculate the address. The
10111/// only part of this the compiler needs to know about is the first xword, which
10112/// contains a function pointer that must be called with the address of the
10113/// entire descriptor in "x0".
10114///
10115/// Since this descriptor may be in a different unit, in general even the
10116/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10117/// is:
10118/// adrp x0, _var@TLVPPAGE
10119/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10120/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10121/// ; the function pointer
10122/// blr x1 ; Uses descriptor address in x0
10123/// ; Address of _var is now in x0.
10124///
10125/// If the address of _var's descriptor *is* known to the linker, then it can
10126/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10127/// a slight efficiency gain.
10128SDValue
10129AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10130 SelectionDAG &DAG) const {
10131 assert(Subtarget->isTargetDarwin() &&
10132 "This function expects a Darwin target");
10133
10134 SDLoc DL(Op);
10135 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10136 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10137 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10138
10139 SDValue TLVPAddr =
10140 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10141 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10142
10143 // The first entry in the descriptor is a function pointer that we must call
10144 // to obtain the address of the variable.
10145 SDValue Chain = DAG.getEntryNode();
10146 SDValue FuncTLVGet = DAG.getLoad(
10147 PtrMemVT, DL, Chain, DescAddr,
10148 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10149 Align(PtrMemVT.getSizeInBits() / 8),
10150 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10151 Chain = FuncTLVGet.getValue(1);
10152
10153 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10154 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10155
10156 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10157 MFI.setAdjustsStack(true);
10158
10159 // TLS calls preserve all registers except those that absolutely must be
10160 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10161 // silly).
10162 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10163 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10164 if (Subtarget->hasCustomCallingConv())
10165 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10166
10167 // Finally, we can make the call. This is just a degenerate version of a
10168 // normal AArch64 call node: x0 takes the address of the descriptor, and
10169 // returns the address of the variable in this thread.
10170 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10171
10172 unsigned Opcode = AArch64ISD::CALL;
10173 SmallVector<SDValue, 8> Ops;
10174 Ops.push_back(Chain);
10175 Ops.push_back(FuncTLVGet);
10176
10177 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10178 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10179 Opcode = AArch64ISD::AUTH_CALL;
10180 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10181 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10182 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10183 }
10184
10185 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10186 Ops.push_back(DAG.getRegisterMask(Mask));
10187 Ops.push_back(Chain.getValue(1));
10188 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10189 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10190}
10191
10192/// Convert a thread-local variable reference into a sequence of instructions to
10193/// compute the variable's address for the local exec TLS model of ELF targets.
10194/// The sequence depends on the maximum TLS area size.
10195SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10196 SDValue ThreadBase,
10197 const SDLoc &DL,
10198 SelectionDAG &DAG) const {
10199 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10200 SDValue TPOff, Addr;
10201
10202 switch (DAG.getTarget().Options.TLSSize) {
10203 default:
10204 llvm_unreachable("Unexpected TLS size");
10205
10206 case 12: {
10207 // mrs x0, TPIDR_EL0
10208 // add x0, x0, :tprel_lo12:a
10209 SDValue Var = DAG.getTargetGlobalAddress(
10210 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10211 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10212 Var,
10213 DAG.getTargetConstant(0, DL, MVT::i32)),
10214 0);
10215 }
10216
10217 case 24: {
10218 // mrs x0, TPIDR_EL0
10219 // add x0, x0, :tprel_hi12:a
10220 // add x0, x0, :tprel_lo12_nc:a
10221 SDValue HiVar = DAG.getTargetGlobalAddress(
10222 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10223 SDValue LoVar = DAG.getTargetGlobalAddress(
10224 GV, DL, PtrVT, 0,
10225 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10226 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10227 HiVar,
10228 DAG.getTargetConstant(0, DL, MVT::i32)),
10229 0);
10230 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10231 LoVar,
10232 DAG.getTargetConstant(0, DL, MVT::i32)),
10233 0);
10234 }
10235
10236 case 32: {
10237 // mrs x1, TPIDR_EL0
10238 // movz x0, #:tprel_g1:a
10239 // movk x0, #:tprel_g0_nc:a
10240 // add x0, x1, x0
10241 SDValue HiVar = DAG.getTargetGlobalAddress(
10242 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10243 SDValue LoVar = DAG.getTargetGlobalAddress(
10244 GV, DL, PtrVT, 0,
10245 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10246 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10247 DAG.getTargetConstant(16, DL, MVT::i32)),
10248 0);
10249 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10250 DAG.getTargetConstant(0, DL, MVT::i32)),
10251 0);
10252 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10253 }
10254
10255 case 48: {
10256 // mrs x1, TPIDR_EL0
10257 // movz x0, #:tprel_g2:a
10258 // movk x0, #:tprel_g1_nc:a
10259 // movk x0, #:tprel_g0_nc:a
10260 // add x0, x1, x0
10261 SDValue HiVar = DAG.getTargetGlobalAddress(
10262 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10263 SDValue MiVar = DAG.getTargetGlobalAddress(
10264 GV, DL, PtrVT, 0,
10265 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10266 SDValue LoVar = DAG.getTargetGlobalAddress(
10267 GV, DL, PtrVT, 0,
10268 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10269 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10270 DAG.getTargetConstant(32, DL, MVT::i32)),
10271 0);
10272 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10273 DAG.getTargetConstant(16, DL, MVT::i32)),
10274 0);
10275 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10276 DAG.getTargetConstant(0, DL, MVT::i32)),
10277 0);
10278 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10279 }
10280 }
10281}
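// Illustrative example (assumed; not emitted by this function alone): with the
// default 24-bit TLS size, a local-exec load of "__thread int x" typically
// becomes
//   mrs  x0, TPIDR_EL0
//   add  x0, x0, :tprel_hi12:x
//   add  x0, x0, :tprel_lo12_nc:x
//   ldr  w0, [x0]
// where the final load is created by the surrounding DAG, not by the address
// computation above.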
10282
10283/// When accessing thread-local variables under either the general-dynamic or
10284/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10285 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry
10286 /// is a function pointer that carries out the resolution.
10287///
10288/// The sequence is:
10289/// adrp x0, :tlsdesc:var
10290/// ldr x1, [x0, #:tlsdesc_lo12:var]
10291/// add x0, x0, #:tlsdesc_lo12:var
10292/// .tlsdesccall var
10293/// blr x1
10294/// (TPIDR_EL0 offset now in x0)
10295///
10296/// The above sequence must be produced unscheduled, to enable the linker to
10297/// optimize/relax this sequence.
10298 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10299 /// above sequence; it is expanded very late in the compilation flow, to ensure
10300 /// the sequence is emitted exactly as shown above.
10301SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10302 const SDLoc &DL,
10303 SelectionDAG &DAG) const {
10304 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10305
10306 SDValue Chain = DAG.getEntryNode();
10307 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10308
10309 unsigned Opcode =
10310 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10311 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10312 : AArch64ISD::TLSDESC_CALLSEQ;
10313 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10314 SDValue Glue = Chain.getValue(1);
10315
10316 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10317}
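// Sketch of how the result is consumed (assumed, for illustration): the
// descriptor call leaves the offset from TPIDR_EL0 in x0, so a general-dynamic
// access typically finishes with
//   mrs  x8, TPIDR_EL0
//   ldr  w0, [x8, x0]
// The ADD of THREAD_POINTER and this offset is emitted by the caller,
// LowerELFGlobalTLSAddress.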
10318
10319SDValue
10320AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10321 SelectionDAG &DAG) const {
10322 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10323
10324 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10325 AArch64FunctionInfo *MFI =
10326 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10327 
10328 TLSModel::Model Model = MFI->hasELFSignedGOT()
10329 ? TLSModel::GeneralDynamic
10330 : getTargetMachine().getTLSModel(GA->getGlobal());
10331 
10332 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10333 if (Model == TLSModel::LocalDynamic)
10334 Model = TLSModel::GeneralDynamic;
10335 }
10336
10337 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10338 Model != TLSModel::LocalExec)
10339 report_fatal_error("ELF TLS only supported in small memory model or "
10340 "in local exec TLS model");
10341 // Different choices can be made for the maximum size of the TLS area for a
10342 // module. For the small address model, the default TLS size is 16MiB and the
10343 // maximum TLS size is 4GiB.
10344 // FIXME: add tiny and large code model support for TLS access models other
10345 // than local exec. We currently generate the same code as small for tiny,
10346 // which may be larger than needed.
10347
10348 SDValue TPOff;
10349 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10350 SDLoc DL(Op);
10351 const GlobalValue *GV = GA->getGlobal();
10352
10353 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10354
10355 if (Model == TLSModel::LocalExec) {
10356 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10357 } else if (Model == TLSModel::InitialExec) {
10358 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10359 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10360 } else if (Model == TLSModel::LocalDynamic) {
10361 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10362 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10363 // the beginning of the module's TLS region, followed by a DTPREL offset
10364 // calculation.
10365
10366 // These accesses will need deduplicating if there's more than one.
10367 MFI->incNumLocalDynamicTLSAccesses();
10368 
10369 // The call needs a relocation too for linker relaxation. It doesn't make
10370 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10371 // the address.
10372 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10373 AArch64II::MO_TLS);
10374 
10375 // Now we can calculate the offset from TPIDR_EL0 to this module's
10376 // thread-local area.
10377 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10378
10379 // Now use :dtprel_whatever: operations to calculate this variable's offset
10380 // in its thread-storage area.
10381 SDValue HiVar = DAG.getTargetGlobalAddress(
10382 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10383 SDValue LoVar = DAG.getTargetGlobalAddress(
10384 GV, DL, MVT::i64, 0,
10385 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10386 
10387 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10388 DAG.getTargetConstant(0, DL, MVT::i32)),
10389 0);
10390 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10391 DAG.getTargetConstant(0, DL, MVT::i32)),
10392 0);
10393 } else if (Model == TLSModel::GeneralDynamic) {
10394 // The call needs a relocation too for linker relaxation. It doesn't make
10395 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10396 // the address.
10397 SDValue SymAddr =
10398 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10399
10400 // Finally we can make a call to calculate the offset from tpidr_el0.
10401 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10402 } else
10403 llvm_unreachable("Unsupported ELF TLS access model");
10404
10405 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10406}
10407
10408SDValue
10409AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10410 SelectionDAG &DAG) const {
10411 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10412
10413 SDValue Chain = DAG.getEntryNode();
10414 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10415 SDLoc DL(Op);
10416
10417 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10418
10419 // Load the ThreadLocalStoragePointer from the TEB
10420 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10421 SDValue TLSArray =
10422 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10423 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10424 Chain = TLSArray.getValue(1);
10425
10426 // Load the TLS index from the C runtime;
10427 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10428 // This also does the same as LOADgot, but using a generic i32 load,
10429 // while LOADgot only loads i64.
10430 SDValue TLSIndexHi =
10431 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10432 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10433 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10434 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10435 SDValue TLSIndex =
10436 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10437 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10438 Chain = TLSIndex.getValue(1);
10439
10440 // The pointer to the thread's TLS data area is stored in the TLS array at
10441 // an offset of TLSIndex * 8 bytes.
10442 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10443 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10444 DAG.getConstant(3, DL, PtrVT));
10445 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10446 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10447 MachinePointerInfo());
10448 Chain = TLS.getValue(1);
10449
10450 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10451 const GlobalValue *GV = GA->getGlobal();
10452 SDValue TGAHi = DAG.getTargetGlobalAddress(
10453 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10454 SDValue TGALo = DAG.getTargetGlobalAddress(
10455 GV, DL, PtrVT, 0,
10456 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10457 
10458 // Add the offset from the start of the .tls section (section base).
10459 SDValue Addr =
10460 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10461 DAG.getTargetConstant(0, DL, MVT::i32)),
10462 0);
10463 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10464 return Addr;
10465}
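// Rough shape of the Windows sequence built above (registers and relocation
// spellings are illustrative only):
//   ldr  x8, [x18, #0x58]       // ThreadLocalStoragePointer from the TEB
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]
//   ldr  x8, [x8, x9, lsl #3]   // this module's TLS data area
//   add  x0, x8, <hi12 offset of the variable in .tls>
//   add  x0, x0, <lo12 offset of the variable in .tls>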
10466
10467SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10468 SelectionDAG &DAG) const {
10469 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10470 if (DAG.getTarget().useEmulatedTLS())
10471 return LowerToTLSEmulatedModel(GA, DAG);
10472
10473 if (Subtarget->isTargetDarwin())
10474 return LowerDarwinGlobalTLSAddress(Op, DAG);
10475 if (Subtarget->isTargetELF())
10476 return LowerELFGlobalTLSAddress(Op, DAG);
10477 if (Subtarget->isTargetWindows())
10478 return LowerWindowsGlobalTLSAddress(Op, DAG);
10479
10480 llvm_unreachable("Unexpected platform trying to use TLS");
10481}
10482
10483//===----------------------------------------------------------------------===//
10484// PtrAuthGlobalAddress lowering
10485//
10486// We have 3 lowering alternatives to choose from:
10487// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10488// If the GV doesn't need a GOT load (i.e., is locally defined)
10489// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10490//
10491// - LOADgotPAC: similar to LOADgot, with added PAC.
10492// If the GV needs a GOT load, materialize the pointer using the usual
10493 // GOT adrp+ldr, +pac. Pointers in the GOT are assumed to be not signed, and
10494 // the GOT section is assumed to be read-only (for example, via the relro
10495 // mechanism). See LowerMOVaddrPAC.
10496//
10497// - LOADauthptrstatic: similar to LOADgot, but use a
10498// special stub slot instead of a GOT slot.
10499// Load a signed pointer for symbol 'sym' from a stub slot named
10500// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10501// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10502// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10503//
10504 // All 3 are pseudos that are expanded late to longer sequences: this lets us
10505// provide integrity guarantees on the to-be-signed intermediate values.
10506//
10507// LOADauthptrstatic is undesirable because it requires a large section filled
10508// with often similarly-signed pointers, making it a good harvesting target.
10509// Thus, it's only used for ptrauth references to extern_weak to avoid null
10510// checks.
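// As a rough sketch (the real expansion happens in the pseudo-expansion pass),
// MOVaddrPAC for a locally-defined symbol with key IA and a small integer
// discriminator D expands to something like
//   adrp x16, sym ; add x16, x16, :lo12:sym
//   mov  x17, #D                 // or a blend with the address discriminator
//   pacia x16, x17
// while LOADgotPAC replaces the adrp+add with a GOT load of the raw pointer
// before signing it.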
10511
10512 static SDValue LowerPtrAuthGlobalAddressStatically(
10513 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10514 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10515 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10516 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10517
10518 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10519 // offset alone as a pointer if the symbol wasn't available, which would
10520 // probably break null checks in users. Ptrauth complicates things further:
10521 // error out.
10522 if (TGN->getOffset() != 0)
10524 "unsupported non-zero offset in weak ptrauth global reference");
10525
10526 if (!isNullConstant(AddrDiscriminator))
10527 report_fatal_error("unsupported weak addr-div ptrauth global");
10528
10529 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10530 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10531 {TGA, Key, Discriminator}),
10532 0);
10533}
10534
10535SDValue
10536AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10537 SelectionDAG &DAG) const {
10538 SDValue Ptr = Op.getOperand(0);
10539 uint64_t KeyC = Op.getConstantOperandVal(1);
10540 SDValue AddrDiscriminator = Op.getOperand(2);
10541 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10542 EVT VT = Op.getValueType();
10543 SDLoc DL(Op);
10544
10545 if (KeyC > AArch64PACKey::LAST)
10546 report_fatal_error("key in ptrauth global out of range [0, " +
10547 Twine((int)AArch64PACKey::LAST) + "]");
10548
10549 // Blend only works if the integer discriminator is 16-bit wide.
10550 if (!isUInt<16>(DiscriminatorC))
10552 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10553
10554 // Choosing between 3 lowering alternatives is target-specific.
10555 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10556 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10557
10558 int64_t PtrOffsetC = 0;
10559 if (Ptr.getOpcode() == ISD::ADD) {
10560 PtrOffsetC = Ptr.getConstantOperandVal(1);
10561 Ptr = Ptr.getOperand(0);
10562 }
10563 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10564 const GlobalValue *PtrGV = PtrN->getGlobal();
10565
10566 // Classify the reference to determine whether it needs a GOT load.
10567 const unsigned OpFlags =
10568 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10569 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10570 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10571 "unsupported non-GOT op flags on ptrauth global reference");
10572
10573 // Fold any offset into the GV; our pseudos expect it there.
10574 PtrOffsetC += PtrN->getOffset();
10575 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10576 /*TargetFlags=*/0);
10577 assert(PtrN->getTargetFlags() == 0 &&
10578 "unsupported target flags on ptrauth global");
10579
10580 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10581 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10582 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10583 ? AddrDiscriminator
10584 : DAG.getRegister(AArch64::XZR, MVT::i64);
10585
10586 // No GOT load needed -> MOVaddrPAC
10587 if (!NeedsGOTLoad) {
10588 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10589 return SDValue(
10590 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10591 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10592 0);
10593 }
10594
10595 // GOT load -> LOADgotPAC
10596 // Note that we disallow extern_weak refs to avoid null checks later.
10597 if (!PtrGV->hasExternalWeakLinkage())
10598 return SDValue(
10599 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10600 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10601 0);
10602
10603 // extern_weak ref -> LOADauthptrstatic
10604 return LowerPtrAuthGlobalAddressStatically(
10605 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10606 DAG);
10607}
10608
10609// Looks through \param Val to determine the bit that can be used to
10610// check the sign of the value. It returns the unextended value and
10611// the sign bit position.
10612std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10613 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10614 return {Val.getOperand(0),
10615 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10616 1};
10617
10618 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10619 return {Val.getOperand(0),
10620 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10621
10622 return {Val, Val.getValueSizeInBits() - 1};
10623}
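// Example: for (sign_extend_inreg x, i8) this returns {x, 7}, so a signed
// comparison against zero can be lowered to a TB(N)Z on bit 7 of the
// unextended value.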
10624
10625SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10626 SDValue Chain = Op.getOperand(0);
10627 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10628 SDValue LHS = Op.getOperand(2);
10629 SDValue RHS = Op.getOperand(3);
10630 SDValue Dest = Op.getOperand(4);
10631 SDLoc DL(Op);
10632 
10633 MachineFunction &MF = DAG.getMachineFunction();
10634 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10635 // will not be produced, as they are conditional branch instructions that do
10636 // not set flags.
10637 bool ProduceNonFlagSettingCondBr =
10638 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10639
10640 // Handle f128 first, since lowering it will result in comparing the return
10641 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10642 // is expecting to deal with.
10643 if (LHS.getValueType() == MVT::f128) {
10644 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10645
10646 // If softenSetCCOperands returned a scalar, we need to compare the result
10647 // against zero to select between true and false values.
10648 if (!RHS.getNode()) {
10649 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10650 CC = ISD::SETNE;
10651 }
10652 }
10653
10654 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10655 // instruction.
10656 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10657 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10658 // Only lower legal XALUO ops.
10659 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10660 return SDValue();
10661
10662 // The actual operation with overflow check.
10663 AArch64CC::CondCode OFCC;
10664 SDValue Value, Overflow;
10665 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10666
10667 if (CC == ISD::SETNE)
10668 OFCC = getInvertedCondCode(OFCC);
10669 SDValue CCVal = getCondCode(DAG, OFCC);
10670
10671 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10672 Overflow);
10673 }
10674
10675 if (LHS.getValueType().isInteger()) {
10676 assert((LHS.getValueType() == RHS.getValueType()) &&
10677 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10678
10679 // If the RHS of the comparison is zero, we can potentially fold this
10680 // to a specialized branch.
10681 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10682 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10683 if (CC == ISD::SETEQ) {
10684 // See if we can use a TBZ to fold in an AND as well.
10685 // TBZ has a smaller branch displacement than CBZ. If the offset is
10686 // out of bounds, a late MI-layer pass rewrites branches.
10687 // 403.gcc is an example that hits this case.
10688 if (LHS.getOpcode() == ISD::AND &&
10689 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10690 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10691 SDValue Test = LHS.getOperand(0);
10692 uint64_t Mask = LHS.getConstantOperandVal(1);
10693 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
10694 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10695 Dest);
10696 }
10697
10698 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
10699 } else if (CC == ISD::SETNE) {
10700 // See if we can use a TBZ to fold in an AND as well.
10701 // TBZ has a smaller branch displacement than CBZ. If the offset is
10702 // out of bounds, a late MI-layer pass rewrites branches.
10703 // 403.gcc is an example that hits this case.
10704 if (LHS.getOpcode() == ISD::AND &&
10705 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10706 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10707 SDValue Test = LHS.getOperand(0);
10708 uint64_t Mask = LHS.getConstantOperandVal(1);
10709 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
10710 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10711 Dest);
10712 }
10713
10714 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
10715 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10716 // Don't combine AND since emitComparison converts the AND to an ANDS
10717 // (a.k.a. TST) and the test in the test bit and branch instruction
10718 // becomes redundant. This would also increase register pressure.
10719 uint64_t SignBitPos;
10720 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10721 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
10722 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10723 }
10724 }
10725 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10726 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10727 // Don't combine AND since emitComparison converts the AND to an ANDS
10728 // (a.k.a. TST) and the test in the test bit and branch instruction
10729 // becomes redundant. This would also increase register pressure.
10730 uint64_t SignBitPos;
10731 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10732 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
10733 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10734 }
10735
10736 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
10737 // larger branch displacement but do prefer CB over cmp + br.
10738 if (Subtarget->hasCMPBR() &&
10739 AArch64CC::isValidCBCond(changeIntCCToAArch64CC(CC)) &&
10740 ProduceNonFlagSettingCondBr) {
10741 SDValue Cond =
10742 DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, MVT::i32);
10743 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
10744 Dest);
10745 }
10746
10747 SDValue CCVal;
10748 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
10749 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10750 Cmp);
10751 }
10752
10753 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10754 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10755
10756 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10757 // clean. Some of them require two branches to implement.
10758 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
10759 AArch64CC::CondCode CC1, CC2;
10760 changeFPCCToAArch64CC(CC, CC1, CC2);
10761 SDValue CC1Val = getCondCode(DAG, CC1);
10762 SDValue BR1 =
10763 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
10764 if (CC2 != AArch64CC::AL) {
10765 SDValue CC2Val = getCondCode(DAG, CC2);
10766 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
10767 Cmp);
10768 }
10769
10770 return BR1;
10771}
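// Illustrative effect of the TBZ/TBNZ folding above (assumed example):
//   br_cc (and x, 8), 0, seteq, %bb   ==>   tbz x, #3, %bb
// i.e. a single test-bit-and-branch instead of a tst + b.eq pair.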
10772
10773SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10774 SelectionDAG &DAG) const {
10775 if (!Subtarget->isNeonAvailable() &&
10776 !Subtarget->useSVEForFixedLengthVectors())
10777 return SDValue();
10778
10779 EVT VT = Op.getValueType();
10780 EVT IntVT = VT.changeTypeToInteger();
10781 SDLoc DL(Op);
10782
10783 SDValue In1 = Op.getOperand(0);
10784 SDValue In2 = Op.getOperand(1);
10785 EVT SrcVT = In2.getValueType();
10786
10787 if (!SrcVT.bitsEq(VT))
10788 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10789
10790 if (VT.isScalableVector())
10791 IntVT =
10792 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10793 
10794 if (VT.isFixedLengthVector() &&
10795 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10796 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10797
10798 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10799 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10800
10801 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10802 return convertFromScalableVector(DAG, VT, Res);
10803 }
10804
10805 // With SVE, but without Neon, extend the scalars to scalable vectors and use
10806 // a SVE FCOPYSIGN.
10807 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
10808 Subtarget->isSVEorStreamingSVEAvailable()) {
10809 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
10810 return SDValue();
10811 EVT SVT = getPackedSVEVectorVT(VT);
10812
10813 SDValue Ins1 =
10814 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
10815 DAG.getConstant(0, DL, MVT::i64));
10816 SDValue Ins2 =
10817 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
10818 DAG.getConstant(0, DL, MVT::i64));
10819 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
10820 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
10821 DAG.getConstant(0, DL, MVT::i64));
10822 }
10823
10824 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10825 if (VT.isScalableVector())
10826 return getSVESafeBitCast(VT, Op, DAG);
10827
10828 return DAG.getBitcast(VT, Op);
10829 };
10830
10831 SDValue VecVal1, VecVal2;
10832 EVT VecVT;
10833 auto SetVecVal = [&](int Idx = -1) {
10834 if (!VT.isVector()) {
10835 VecVal1 =
10836 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10837 VecVal2 =
10838 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10839 } else {
10840 VecVal1 = BitCast(VecVT, In1, DAG);
10841 VecVal2 = BitCast(VecVT, In2, DAG);
10842 }
10843 };
10844 if (VT.isVector()) {
10845 VecVT = IntVT;
10846 SetVecVal();
10847 } else if (VT == MVT::f64) {
10848 VecVT = MVT::v2i64;
10849 SetVecVal(AArch64::dsub);
10850 } else if (VT == MVT::f32) {
10851 VecVT = MVT::v4i32;
10852 SetVecVal(AArch64::ssub);
10853 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10854 VecVT = MVT::v8i16;
10855 SetVecVal(AArch64::hsub);
10856 } else {
10857 llvm_unreachable("Invalid type for copysign!");
10858 }
10859
10860 unsigned BitWidth = In1.getScalarValueSizeInBits();
10861 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10862
10863 // We want to materialize a mask with every bit but the high bit set, but the
10864 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10865 // 64-bit elements. Instead, materialize all bits set and then negate that.
10866 if (VT == MVT::f64 || VT == MVT::v2f64) {
10867 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10868 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10869 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10870 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10871 }
10872
10873 SDValue BSP =
10874 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10875 if (VT == MVT::f16 || VT == MVT::bf16)
10876 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10877 if (VT == MVT::f32)
10878 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10879 if (VT == MVT::f64)
10880 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10881
10882 return BitCast(VT, BSP, DAG);
10883}
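// Sketch of the scalar f64 path (illustrative): the mask with every bit except
// the sign bit set is built by negating an all-ones vector (fneg simply flips
// the sign bit), and the BSP node then selects the magnitude bits from In1 and
// the sign bit from In2; BSP itself is later expanded to one of BSL/BIT/BIF
// depending on register allocation.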
10884
10885SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10886 SelectionDAG &DAG) const {
10887 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10888 Attribute::NoImplicitFloat))
10889 return SDValue();
10890
10891 EVT VT = Op.getValueType();
10892 if (VT.isScalableVector() ||
10893 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10894 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10895
10896 bool IsParity = Op.getOpcode() == ISD::PARITY;
10897 SDValue Val = Op.getOperand(0);
10898 SDLoc DL(Op);
10899
10900 // For i32, a general parity computation using EORs is more efficient than
10901 // going through the floating-point/SIMD registers.
10902 if (VT == MVT::i32 && IsParity)
10903 return SDValue();
10904
10905 if (Subtarget->isSVEorStreamingSVEAvailable()) {
10906 if (VT == MVT::i32 || VT == MVT::i64) {
10907 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
10908 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
10909 DAG.getUNDEF(ContainerVT), Val,
10910 DAG.getVectorIdxConstant(0, DL));
10911 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
10912 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
10913 DAG.getVectorIdxConstant(0, DL));
10914 if (IsParity)
10915 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
10916 return Val;
10917 }
10918
10919 if (VT == MVT::i128) {
10920 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
10921 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
10922 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
10923 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
10924 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
10925 Val = DAG.getZExtOrTrunc(Val, DL, VT);
10926 if (IsParity)
10927 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
10928 return Val;
10929 }
10930 }
10931
10932 if (!Subtarget->isNeonAvailable())
10933 return SDValue();
10934
10935 // There is no popcount instruction for the general-purpose registers, but
10936 // CTPOP can be efficiently lowered to the following sequence that uses
10937 // AdvSIMD registers/instructions, as long as the copies to/from
10938 // the AdvSIMD registers are cheap.
10939 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10940 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10941 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10942 // FMOV X0, D0 // copy result back to integer reg
10943 if (VT == MVT::i32 || VT == MVT::i64) {
10944 if (VT == MVT::i32)
10945 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10946 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10947
10948 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10949 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10950 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
10951 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
10952 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
10953 DAG.getConstant(0, DL, MVT::i64));
10954 if (IsParity)
10955 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10956 return AddV;
10957 } else if (VT == MVT::i128) {
10958 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10959
10960 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10961 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10962 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
10963 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
10964 DAG.getConstant(0, DL, MVT::i64));
10965 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
10966 if (IsParity)
10967 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10968 return AddV;
10969 }
10970
10971 assert(!IsParity && "ISD::PARITY of vector types not supported");
10972
10973 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10974 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10975 "Unexpected type for custom ctpop lowering");
10976
10977 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10978 Val = DAG.getBitcast(VT8Bit, Val);
10979 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10980
10981 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10982 VT.getVectorNumElements() >= 2) {
10983 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10984 SDValue Zeros = DAG.getConstant(0, DL, DT);
10985 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10986
10987 if (VT == MVT::v2i64) {
10988 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10989 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10990 } else if (VT == MVT::v2i32) {
10991 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10992 } else if (VT == MVT::v4i32) {
10993 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10994 } else {
10995 llvm_unreachable("Unexpected type for custom ctpop lowering");
10996 }
10997
10998 return Val;
10999 }
11000
11001 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11002 unsigned EltSize = 8;
11003 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11004 while (EltSize != VT.getScalarSizeInBits()) {
11005 EltSize *= 2;
11006 NumElts /= 2;
11007 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11008 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11009 }
11010
11011 return Val;
11012}
11013
11014SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11015 EVT VT = Op.getValueType();
11016 assert(VT.isScalableVector() ||
11017 useSVEForFixedLengthVectorVT(
11018 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11019
11020 SDLoc DL(Op);
11021 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11022 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11023}
11024
11025SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11026 SelectionDAG &DAG) const {
11027
11028 EVT VT = Op.getValueType();
11029 SDLoc DL(Op);
11030 unsigned Opcode = Op.getOpcode();
11031 ISD::CondCode CC;
11032 switch (Opcode) {
11033 default:
11034 llvm_unreachable("Wrong instruction");
11035 case ISD::SMAX:
11036 CC = ISD::SETGT;
11037 break;
11038 case ISD::SMIN:
11039 CC = ISD::SETLT;
11040 break;
11041 case ISD::UMAX:
11042 CC = ISD::SETUGT;
11043 break;
11044 case ISD::UMIN:
11045 CC = ISD::SETULT;
11046 break;
11047 }
11048
11049 if (VT.isScalableVector() ||
11050 useSVEForFixedLengthVectorVT(
11051 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11052 switch (Opcode) {
11053 default:
11054 llvm_unreachable("Wrong instruction");
11055 case ISD::SMAX:
11056 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11057 case ISD::SMIN:
11058 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11059 case ISD::UMAX:
11060 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11061 case ISD::UMIN:
11062 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11063 }
11064 }
11065
11066 SDValue Op0 = Op.getOperand(0);
11067 SDValue Op1 = Op.getOperand(1);
11068 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11069 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11070}
11071
11072SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11073 SelectionDAG &DAG) const {
11074 EVT VT = Op.getValueType();
11075
11076 if (VT.isScalableVector() ||
11077 useSVEForFixedLengthVectorVT(
11078 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11079 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11080
11081 SDLoc DL(Op);
11082 SDValue REVB;
11083 MVT VST;
11084
11085 switch (VT.getSimpleVT().SimpleTy) {
11086 default:
11087 llvm_unreachable("Invalid type for bitreverse!");
11088
11089 case MVT::v2i32: {
11090 VST = MVT::v8i8;
11091 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11092
11093 break;
11094 }
11095
11096 case MVT::v4i32: {
11097 VST = MVT::v16i8;
11098 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11099
11100 break;
11101 }
11102
11103 case MVT::v1i64: {
11104 VST = MVT::v8i8;
11105 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11106
11107 break;
11108 }
11109
11110 case MVT::v2i64: {
11111 VST = MVT::v16i8;
11112 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11113
11114 break;
11115 }
11116 }
11117
11118 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11119 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11120}
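// For example, bitreverse of a v4i32 value is emitted as a byte-wise REV32
// followed by a per-byte RBIT, roughly:
//   rev32 v0.16b, v0.16b
//   rbit  v0.16b, v0.16b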
11121
11122 // Check whether N is part of a continuous comparison sequence (an OR tree of XORs).
11123static bool
11124isOrXorChain(SDValue N, unsigned &Num,
11125 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11126 if (Num == MaxXors)
11127 return false;
11128
11129 // Skip the one-use zext
11130 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11131 N = N->getOperand(0);
11132
11133 // The leaf node must be XOR
11134 if (N->getOpcode() == ISD::XOR) {
11135 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11136 Num++;
11137 return true;
11138 }
11139
11140 // All the non-leaf nodes must be OR.
11141 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11142 return false;
11143
11144 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11145 isOrXorChain(N->getOperand(1), Num, WorkList))
11146 return true;
11147 return false;
11148}
11149
11150 // Transform chains of ORs and XORs, which are usually produced by expanded memcmp/bcmp.
11151 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11152 SDValue LHS = N->getOperand(0);
11153 SDValue RHS = N->getOperand(1);
11154 SDLoc DL(N);
11155 EVT VT = N->getValueType(0);
11156 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11157 
11158 // Only handle integer compares.
11159 if (N->getOpcode() != ISD::SETCC)
11160 return SDValue();
11161
11162 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11163 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11164 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11165 unsigned NumXors = 0;
11166 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11167 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11168 isOrXorChain(LHS, NumXors, WorkList)) {
11169 SDValue XOR0, XOR1;
11170 std::tie(XOR0, XOR1) = WorkList[0];
11171 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11172 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11173 for (unsigned I = 1; I < WorkList.size(); I++) {
11174 std::tie(XOR0, XOR1) = WorkList[I];
11175 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11176 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11177 }
11178
11179 // Exit early by inverting the condition, which helps reduce indentation.
11180 return Cmp;
11181 }
11182
11183 return SDValue();
11184}
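// Illustrative input/output (assumed IR): for
//   icmp eq (or (xor a0, a1), (xor b0, b1)), 0
// the combine returns (and (setcc a0, a1, eq), (setcc b0, b1, eq)), which the
// AArch64 backend later selects as a cmp + ccmp + cset sequence instead of a
// chain of eor/orr followed by a compare against zero.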
11185
11186SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11187
11188 if (Op.getValueType().isVector())
11189 return LowerVSETCC(Op, DAG);
11190
11191 bool IsStrict = Op->isStrictFPOpcode();
11192 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11193 unsigned OpNo = IsStrict ? 1 : 0;
11194 SDValue Chain;
11195 if (IsStrict)
11196 Chain = Op.getOperand(0);
11197 SDValue LHS = Op.getOperand(OpNo + 0);
11198 SDValue RHS = Op.getOperand(OpNo + 1);
11199 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11200 SDLoc DL(Op);
11201
11202 // We chose ZeroOrOneBooleanContents, so use zero and one.
11203 EVT VT = Op.getValueType();
11204 SDValue TVal = DAG.getConstant(1, DL, VT);
11205 SDValue FVal = DAG.getConstant(0, DL, VT);
11206
11207 // Handle f128 first, since one possible outcome is a normal integer
11208 // comparison which gets picked up by the next if statement.
11209 if (LHS.getValueType() == MVT::f128) {
11210 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11211 IsSignaling);
11212
11213 // If softenSetCCOperands returned a scalar, use it.
11214 if (!RHS.getNode()) {
11215 assert(LHS.getValueType() == Op.getValueType() &&
11216 "Unexpected setcc expansion!");
11217 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11218 }
11219 }
11220
11221 if (LHS.getValueType().isInteger()) {
11222
11223 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11224
11225 SDValue CCVal;
11226 SDValue Cmp = getAArch64Cmp(
11227 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11228
11229 // Note that we inverted the condition above, so we reverse the order of
11230 // the true and false operands here. This will allow the setcc to be
11231 // matched to a single CSINC instruction.
11232 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11233 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11234 }
11235
11236 // Now we know we're dealing with FP values.
11237 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11238 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11239
11240 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11241 // and do the comparison.
11242 SDValue Cmp;
11243 if (IsStrict)
11244 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11245 else
11246 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11247
11248 AArch64CC::CondCode CC1, CC2;
11249 changeFPCCToAArch64CC(CC, CC1, CC2);
11250 SDValue Res;
11251 if (CC2 == AArch64CC::AL) {
11252 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11253 CC2);
11254 SDValue CC1Val = getCondCode(DAG, CC1);
11255
11256 // Note that we inverted the condition above, so we reverse the order of
11257 // the true and false operands here. This will allow the setcc to be
11258 // matched to a single CSINC instruction.
11259 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11260 } else {
11261 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11262 // totally clean. Some of them require two CSELs to implement. As is in
11263 // this case, we emit the first CSEL and then emit a second using the output
11264 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11265
11266 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11267 SDValue CC1Val = getCondCode(DAG, CC1);
11268 SDValue CS1 =
11269 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11270
11271 SDValue CC2Val = getCondCode(DAG, CC2);
11272 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11273 }
11274 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11275}
11276
11277SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11278 SelectionDAG &DAG) const {
11279
11280 SDValue LHS = Op.getOperand(0);
11281 SDValue RHS = Op.getOperand(1);
11282 EVT VT = LHS.getValueType();
11283 if (VT != MVT::i32 && VT != MVT::i64)
11284 return SDValue();
11285
11286 SDLoc DL(Op);
11287 SDValue Carry = Op.getOperand(2);
11288 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11289 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11290 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11291 LHS, RHS, InvCarry);
11292
11293 EVT OpVT = Op.getValueType();
11294 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11295 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11296
11297 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11298 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11299 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11300 // Inputs are swapped because the condition is inverted. This will allow
11301 // matching with a single CSINC instruction.
11302 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11303 Cmp.getValue(1));
11304}
11305
11306/// Emit vector comparison for floating-point values, producing a mask.
11307 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11308 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11309 const SDLoc &DL, SelectionDAG &DAG) {
11310 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11311 "function only supposed to emit natural comparisons");
11312
11313 switch (CC) {
11314 default:
11315 return SDValue();
11316 case AArch64CC::NE: {
11317 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11318 // Use vector semantics for the inversion to potentially save a copy between
11319 // SIMD and regular registers.
11320 if (!LHS.getValueType().isVector()) {
11321 EVT VecVT =
11322 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11323 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11324 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11325 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11326 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11327 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11328 }
11329 return DAG.getNOT(DL, Fcmeq, VT);
11330 }
11331 case AArch64CC::EQ:
11332 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11333 case AArch64CC::GE:
11334 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11335 case AArch64CC::GT:
11336 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11337 case AArch64CC::LE:
11338 if (!NoNans)
11339 return SDValue();
11340 // If we ignore NaNs then we can use the LS implementation.
11341 [[fallthrough]];
11342 case AArch64CC::LS:
11343 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11344 case AArch64CC::LT:
11345 if (!NoNans)
11346 return SDValue();
11347 // If we ignore NaNs then we can use the MI implementation.
11348 [[fallthrough]];
11349 case AArch64CC::MI:
11350 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11351 }
11352}
11353
11354/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11355/// values are scalars, try to emit a mask generating vector instruction.
11356 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11357 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11358 const SDLoc &DL, SelectionDAG &DAG) {
11359 assert(!LHS.getValueType().isVector());
11360 assert(!RHS.getValueType().isVector());
11361
11362 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11363 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11364 if (!CTVal || !CFVal)
11365 return {};
11366 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11367 !(CTVal->isZero() && CFVal->isAllOnes()))
11368 return {};
11369
11370 if (CTVal->isZero())
11371 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11372
11373 EVT VT = TVal.getValueType();
11374 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11375 return {};
11376
11377 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11378 bool OneNaN = false;
11379 if (LHS == RHS) {
11380 OneNaN = true;
11381 } else if (DAG.isKnownNeverNaN(RHS)) {
11382 OneNaN = true;
11383 RHS = LHS;
11384 } else if (DAG.isKnownNeverNaN(LHS)) {
11385 OneNaN = true;
11386 LHS = RHS;
11387 }
11388 if (OneNaN)
11389 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11390 }
11391 
11392 AArch64CC::CondCode CC1;
11393 AArch64CC::CondCode CC2;
11394 bool ShouldInvert = false;
11395 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11396 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11397 SDValue Cmp2;
11398 if (CC2 != AArch64CC::AL) {
11399 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11400 if (!Cmp2)
11401 return {};
11402 }
11403 if (!Cmp2 && !ShouldInvert)
11404 return Cmp;
11405
11406 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11407 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11408 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11409 Zero);
11410 if (Cmp2) {
11411 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11412 Cmp2, Zero);
11413 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11414 }
11415 if (ShouldInvert)
11416 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11417 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11418 return Cmp;
11419}
11420
11421SDValue AArch64TargetLowering::LowerSELECT_CC(
11422 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11423 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
11424 const SDLoc &DL, SelectionDAG &DAG) const {
11425 // Handle f128 first, because it will result in a comparison of some RTLIB
11426 // call result against zero.
11427 if (LHS.getValueType() == MVT::f128) {
11428 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11429
11430 // If softenSetCCOperands returned a scalar, we need to compare the result
11431 // against zero to select between true and false values.
11432 if (!RHS.getNode()) {
11433 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11434 CC = ISD::SETNE;
11435 }
11436 }
11437
11438 // Also handle f16, for which we need to do a f32 comparison.
11439 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11440 LHS.getValueType() == MVT::bf16) {
11441 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11442 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11443 }
11444
11445 // Next, handle integers.
11446 if (LHS.getValueType().isInteger()) {
11447 assert((LHS.getValueType() == RHS.getValueType()) &&
11448 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11449
11450 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11451 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11452 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11453
11454 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11455 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11456 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11457 // Both require less instructions than compare and conditional select.
11458 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11459 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11460 LHS.getValueType() == RHS.getValueType()) {
11461 EVT VT = LHS.getValueType();
11462 SDValue Shift =
11463 DAG.getNode(ISD::SRA, DL, VT, LHS,
11464 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11465
11466 if (CC == ISD::SETGT)
11467 Shift = DAG.getNOT(DL, Shift, VT);
11468
11469 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11470 }
11471
11472 // Canonicalise absolute difference patterns:
11473 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11474 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11475 //
11476 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11477 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11478 // The second forms can be matched into subs+cneg.
11479 // NOTE: Drop poison generating flags from the negated operand to avoid
11480 // inadvertently propagating poison after the canonicalisation.
11481 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11482 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11483 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11485 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11486 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11487 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11489 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11490 }
11491 }
11492
11493 unsigned Opcode = AArch64ISD::CSEL;
11494
11495 // If both the TVal and the FVal are constants, see if we can swap them in
11496 // order to form a CSINV or CSINC out of them.
11497 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11498 std::swap(TVal, FVal);
11499 std::swap(CTVal, CFVal);
11500 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11501 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11502 std::swap(TVal, FVal);
11503 std::swap(CTVal, CFVal);
11504 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11505 } else if (TVal.getOpcode() == ISD::XOR) {
11506 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11507 // with a CSINV rather than a CSEL.
11508 if (isAllOnesConstant(TVal.getOperand(1))) {
11509 std::swap(TVal, FVal);
11510 std::swap(CTVal, CFVal);
11511 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11512 }
11513 } else if (TVal.getOpcode() == ISD::SUB) {
11514 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11515 // that we can match with a CSNEG rather than a CSEL.
11516 if (isNullConstant(TVal.getOperand(0))) {
11517 std::swap(TVal, FVal);
11518 std::swap(CTVal, CFVal);
11519 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11520 }
11521 } else if (CTVal && CFVal) {
11522 const int64_t TrueVal = CTVal->getSExtValue();
11523 const int64_t FalseVal = CFVal->getSExtValue();
11524 bool Swap = false;
11525
11526 // If both TVal and FVal are constants, see if FVal is the
11527 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11528 // instead of a CSEL in that case.
11529 if (TrueVal == ~FalseVal) {
11530 Opcode = AArch64ISD::CSINV;
11531 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11532 TrueVal == -FalseVal) {
11533 Opcode = AArch64ISD::CSNEG;
11534 } else if (TVal.getValueType() == MVT::i32) {
11535 // If our operands are only 32-bit wide, make sure we use 32-bit
11536 // arithmetic for the check whether we can use CSINC. This ensures that
11537 // the addition in the check will wrap around properly in case there is
11538 // an overflow (which would not be the case if we do the check with
11539 // 64-bit arithmetic).
11540 const uint32_t TrueVal32 = CTVal->getZExtValue();
11541 const uint32_t FalseVal32 = CFVal->getZExtValue();
11542
11543 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11544 Opcode = AArch64ISD::CSINC;
11545
11546 if (TrueVal32 > FalseVal32) {
11547 Swap = true;
11548 }
11549 }
11550 } else {
11551 // 64-bit check whether we can use CSINC.
11552 const uint64_t TrueVal64 = TrueVal;
11553 const uint64_t FalseVal64 = FalseVal;
11554
11555 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11556 Opcode = AArch64ISD::CSINC;
11557
11558 if (TrueVal > FalseVal) {
11559 Swap = true;
11560 }
11561 }
11562 }
11563
11564 // Swap TVal and FVal if necessary.
11565 if (Swap) {
11566 std::swap(TVal, FVal);
11567 std::swap(CTVal, CFVal);
11568 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11569 }
11570
11571 if (Opcode != AArch64ISD::CSEL) {
11572 // Drop FVal since we can get its value by simply inverting/negating
11573 // TVal.
11574 FVal = TVal;
11575 }
11576 }
11577
11578 // Avoid materializing a constant when possible by reusing a known value in
11579 // a register. However, don't perform this optimization if the known value
11580 // is one, zero or negative one in the case of a CSEL. We can always
11581 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11582 // FVal, respectively.
11583 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11584 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11585 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11587 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11588 // "a != C ? x : a" to avoid materializing C.
11589 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11590 TVal = LHS;
11591 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11592 FVal = LHS;
11593 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11594 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11595 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11596 // avoid materializing C.
11597 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11598 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11599 Opcode = AArch64ISD::CSINV;
11600 TVal = LHS;
11601 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11602 }
11603 }
11604
11605 SDValue CCVal;
11606 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11607 EVT VT = TVal.getValueType();
11608 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11609 }
11610
11611 // Now we know we're dealing with FP values.
11612 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11613 LHS.getValueType() == MVT::f64);
11614 assert(LHS.getValueType() == RHS.getValueType());
11615 EVT VT = TVal.getValueType();
11616
11617 // If the purpose of the comparison is to select between all ones
11618 // or all zeros, try to use a vector comparison because the operands are
11619 // already stored in SIMD registers.
11620 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11621 switch (U->getOpcode()) {
11622 default:
11623 return false;
11624 case ISD::INSERT_VECTOR_ELT:
11625 case ISD::SCALAR_TO_VECTOR:
11626 case AArch64ISD::DUP:
11627 return true;
11628 }
11629 })) {
11630 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11631 SDValue VectorCmp =
11632 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11633 if (VectorCmp)
11634 return VectorCmp;
11635 }
11636
11637 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11638
11639 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11640 // clean. Some of them require two CSELs to implement.
11641 AArch64CC::CondCode CC1, CC2;
11642 changeFPCCToAArch64CC(CC, CC1, CC2);
11643
11644 if (Flags.hasNoSignedZeros()) {
11645 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11646 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11647 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11648 if (RHSVal && RHSVal->isZero()) {
11649 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11650 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11651
11652 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11653 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11654 TVal = LHS;
11655 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11656 CFVal && CFVal->isZero() &&
11657 FVal.getValueType() == LHS.getValueType())
11658 FVal = LHS;
11659 }
11660 }
11661
11662 // Emit first, and possibly only, CSEL.
11663 SDValue CC1Val = getCondCode(DAG, CC1);
11664 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11665
11666 // If we need a second CSEL, emit it, using the output of the first as the
11667 // RHS. We're effectively OR'ing the two CC's together.
11668 if (CC2 != AArch64CC::AL) {
11669 SDValue CC2Val = getCondCode(DAG, CC2);
11670 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11671 }
11672
11673 // Otherwise, return the output of the first CSEL.
11674 return CS1;
11675}
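// Illustrative constant case (assumed values): "select_cc x, 0, 7, 6, ne"
// takes the CSINC path above and becomes roughly
//   cmp   x0, #0
//   mov   w8, #6
//   csinc w0, w8, w8, eq
// i.e. the larger constant is derived from the smaller one instead of being
// materialized separately.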
11676
11677SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11678 SelectionDAG &DAG) const {
11679 EVT Ty = Op.getValueType();
11680 auto Idx = Op.getConstantOperandAPInt(2);
11681 int64_t IdxVal = Idx.getSExtValue();
11682 assert(Ty.isScalableVector() &&
11683 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11684
11685 // We can use the splice instruction for certain index values where we are
11686 // able to efficiently generate the correct predicate. The index will be
11687 // inverted and used directly as the input to the ptrue instruction, i.e.
11688 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11689 // splice predicate. However, we can only do this if we can guarantee that
11690 // there are enough elements in the vector, hence we check the index <= min
11691 // number of elements.
11692 std::optional<unsigned> PredPattern;
11693 if (Ty.isScalableVector() && IdxVal < 0 &&
11694 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11695 std::nullopt) {
11696 SDLoc DL(Op);
11697
11698 // Create a predicate where all but the last -IdxVal elements are false.
11699 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11700 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11701 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11702
11703 // Now splice the two inputs together using the predicate.
11704 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11705 Op.getOperand(1));
11706 }
11707
11708 // We can select to an EXT instruction when indexing the first 256 bytes.
11710 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11711 return Op;
11712
11713 return SDValue();
11714}
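// Example (sketch): splicing the last two elements of one nxv4i32 vector onto
// the front of another (index -2) becomes roughly
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s
// whereas small non-negative indices fall through and are matched to an EXT.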
11715
11716SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11717 SelectionDAG &DAG) const {
11718 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11719 SDValue LHS = Op.getOperand(0);
11720 SDValue RHS = Op.getOperand(1);
11721 SDValue TVal = Op.getOperand(2);
11722 SDValue FVal = Op.getOperand(3);
11723 SDNodeFlags Flags = Op->getFlags();
11724 SDLoc DL(Op);
11725 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
11726}
11727
11728SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11729 SelectionDAG &DAG) const {
11730 SDValue CCVal = Op->getOperand(0);
11731 SDValue TVal = Op->getOperand(1);
11732 SDValue FVal = Op->getOperand(2);
11733 SDLoc DL(Op);
11734
11735 EVT Ty = Op.getValueType();
11736 if (Ty == MVT::aarch64svcount) {
11737 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11738 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11739 SDValue Sel =
11740 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11741 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11742 }
11743
11744 if (Ty.isScalableVector()) {
11745 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11746 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11747 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11748 }
11749
11750 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11751 // FIXME: Ideally this would be the same as above using i1 types, however
11752 // for the moment we can't deal with fixed i1 vector types properly, so
11753 // instead extend the predicate to a result type sized integer vector.
11754 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11755 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11756 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11757 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11758 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11759 }
11760
11761 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11762 // instruction.
11763 if (ISD::isOverflowIntrOpRes(CCVal)) {
11764 // Only lower legal XALUO ops.
11765 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11766 return SDValue();
11767
11768 AArch64CC::CondCode OFCC;
11769 SDValue Value, Overflow;
11770 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11771 SDValue CCVal = getCondCode(DAG, OFCC);
11772
11773 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11774 CCVal, Overflow);
11775 }
11776
11777 // Lower it the same way as we would lower a SELECT_CC node.
11778 ISD::CondCode CC;
11779 SDValue LHS, RHS;
11780 if (CCVal.getOpcode() == ISD::SETCC) {
11781 LHS = CCVal.getOperand(0);
11782 RHS = CCVal.getOperand(1);
11783 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11784 } else {
11785 LHS = CCVal;
11786 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11787 CC = ISD::SETNE;
11788 }
11789
11790 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
11791 // convert to an f32 in order to use FCSELSrrr.
11792 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11793 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11794 DAG.getUNDEF(MVT::f32), TVal);
11795 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11796 DAG.getUNDEF(MVT::f32), FVal);
11797 }
11798
11799 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
11800 Op->getFlags(), DL, DAG);
11801
11802 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11803 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11804 }
11805
11806 return Res;
11807}
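// Illustrative sketch of the overflow-intrinsic path above, assuming a
// legal i32 sadd.with.overflow feeding the select:
//   %res = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %sel = select i1 %res.1, i32 %t, i32 %f
// getAArch64XALUOOp turns the intrinsic into a flag-setting add, and the
// select becomes a single CSEL on those flags, roughly (wT/wF placeholders):
//   adds w8, w0, w1        ; sets NZCV
//   csel w0, wT, wF, vs    ; vs = signed-overflow condition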
11808
11809SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11810 SelectionDAG &DAG) const {
11811 // Jump table entries are emitted as PC-relative offsets. No additional
11812 // tweaking is necessary here. Just get the address of the jump table.
11813 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11814
11817 !Subtarget->isTargetMachO())
11818 return getAddrLarge(JT, DAG);
11819 if (CM == CodeModel::Tiny)
11820 return getAddrTiny(JT, DAG);
11821 return getAddr(JT, DAG);
11822}
11823
11824SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11825 SelectionDAG &DAG) const {
11826 // Jump table entries are emitted as PC-relative offsets. No additional
11827 // tweaking is necessary here. Just get the address of the jump table.
11828 SDLoc DL(Op);
11829 SDValue JT = Op.getOperand(1);
11830 SDValue Entry = Op.getOperand(2);
11831 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11832
11833 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11834 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11835
11836 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11837 // sequence later, to guarantee the integrity of the intermediate values.
11839 "aarch64-jump-table-hardening")) {
11841 if (Subtarget->isTargetMachO()) {
11842 if (CM != CodeModel::Small && CM != CodeModel::Large)
11843 report_fatal_error("Unsupported code-model for hardened jump-table");
11844 } else {
11845 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11846 assert(Subtarget->isTargetELF() &&
11847 "jump table hardening only supported on MachO/ELF");
11848 if (CM != CodeModel::Small)
11849 report_fatal_error("Unsupported code-model for hardened jump-table");
11850 }
11851
11852 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11853 Entry, SDValue());
11854 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11855 DAG.getTargetJumpTable(JTI, MVT::i32),
11856 X16Copy.getValue(0), X16Copy.getValue(1));
11857 return SDValue(B, 0);
11858 }
11859
11860 SDNode *Dest =
11861 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11862 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11863 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11864 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11865}
11866
11867SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11868 SDValue Chain = Op.getOperand(0);
11869 SDValue Dest = Op.getOperand(1);
11870
11871 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
11872 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11873 if (Dest->isMachineOpcode() &&
11874 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11875 return SDValue();
11876
11877 const MachineFunction &MF = DAG.getMachineFunction();
11878 std::optional<uint16_t> BADisc =
11880 if (!BADisc)
11881 return SDValue();
11882
11883 SDLoc DL(Op);
11884
11885 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11887 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11888
11889 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11890 {Dest, Key, Disc, AddrDisc, Chain});
11891 return SDValue(BrA, 0);
11892}
11893
11894SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11895 SelectionDAG &DAG) const {
11896 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11897 CodeModel::Model CM = getTargetMachine().getCodeModel();
11898 if (CM == CodeModel::Large) {
11899 // Use the GOT for the large code model on iOS.
11900 if (Subtarget->isTargetMachO()) {
11901 return getGOT(CP, DAG);
11902 }
11904 return getAddrLarge(CP, DAG);
11905 } else if (CM == CodeModel::Tiny) {
11906 return getAddrTiny(CP, DAG);
11907 }
11908 return getAddr(CP, DAG);
11909}
11910
11911SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11912 SelectionDAG &DAG) const {
11913 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11914 const BlockAddress *BA = BAN->getBlockAddress();
11915
11916 if (std::optional<uint16_t> BADisc =
11918 *BA->getFunction())) {
11919 SDLoc DL(Op);
11920
11921 // This isn't cheap, but BRIND is rare.
11922 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11923
11924 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11925
11927 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11928
11929 SDNode *MOV =
11930 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11931 {TargetBA, Key, AddrDisc, Disc});
11932 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11933 SDValue(MOV, 1));
11934 }
11935
11936 CodeModel::Model CM = getTargetMachine().getCodeModel();
11937 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11939 return getAddrLarge(BAN, DAG);
11940 } else if (CM == CodeModel::Tiny) {
11941 return getAddrTiny(BAN, DAG);
11942 }
11943 return getAddr(BAN, DAG);
11944}
11945
11946SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11947 SelectionDAG &DAG) const {
11948 AArch64FunctionInfo *FuncInfo =
11949 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11950
11951 SDLoc DL(Op);
11952 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11954 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11955 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11956 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11957 MachinePointerInfo(SV));
11958}
11959
11960SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11961 SelectionDAG &DAG) const {
11962 MachineFunction &MF = DAG.getMachineFunction();
11963 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11964
11965 SDLoc DL(Op);
11966 SDValue FR;
11967 if (Subtarget->isWindowsArm64EC()) {
11968 // With the Arm64EC ABI, we compute the address of the varargs save area
11969 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11970 // but calls from an entry thunk can pass in a different address.
11971 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11972 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11973 uint64_t StackOffset;
11974 if (FuncInfo->getVarArgsGPRSize() > 0)
11975 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11976 else
11977 StackOffset = FuncInfo->getVarArgsStackOffset();
11978 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11979 DAG.getConstant(StackOffset, DL, MVT::i64));
11980 } else {
11981 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11982 ? FuncInfo->getVarArgsGPRIndex()
11983 : FuncInfo->getVarArgsStackIndex(),
11985 }
11986 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11987 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11988 MachinePointerInfo(SV));
11989}
11990
11991SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11992 SelectionDAG &DAG) const {
11993 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11994 // Standard, section B.3.
11995 AArch64FunctionInfo *FuncInfo =
11996 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11997 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11998 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11999 auto PtrVT = getPointerTy(DAG.getDataLayout());
12000 SDLoc DL(Op);
12001
12002 SDValue Chain = Op.getOperand(0);
12003 SDValue VAList = Op.getOperand(1);
12004 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12006
12007 // void *__stack at offset 0
12008 unsigned Offset = 0;
12009 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12010 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12011 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12012 MachinePointerInfo(SV), Align(PtrSize)));
12013
12014 // void *__gr_top at offset 8 (4 on ILP32)
12015 Offset += PtrSize;
12016 int GPRSize = FuncInfo->getVarArgsGPRSize();
12017 if (GPRSize > 0) {
12018 SDValue GRTop, GRTopAddr;
12019
12020 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12021 DAG.getConstant(Offset, DL, PtrVT));
12022
12023 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12024 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12025 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12026 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12027
12028 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12030 Align(PtrSize)));
12031 }
12032
12033 // void *__vr_top at offset 16 (8 on ILP32)
12034 Offset += PtrSize;
12035 int FPRSize = FuncInfo->getVarArgsFPRSize();
12036 if (FPRSize > 0) {
12037 SDValue VRTop, VRTopAddr;
12038 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12039 DAG.getConstant(Offset, DL, PtrVT));
12040
12041 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12042 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12043 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12044 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12045
12046 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12048 Align(PtrSize)));
12049 }
12050
12051 // int __gr_offs at offset 24 (12 on ILP32)
12052 Offset += PtrSize;
12053 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12054 DAG.getConstant(Offset, DL, PtrVT));
12055 MemOps.push_back(
12056 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12057 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12058
12059 // int __vr_offs at offset 28 (16 on ILP32)
12060 Offset += 4;
12061 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12062 DAG.getConstant(Offset, DL, PtrVT));
12063 MemOps.push_back(
12064 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12065 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12066
12067 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12068}
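// For reference, the stores built above populate the AAPCS64 va_list layout
// from section B.3 of the Procedure Call Standard (LP64 offsets shown;
// ILP32 uses 4-byte pointers, shifting the later fields as noted per field
// in the comments above):
//   struct va_list {
//     void *__stack;    // offset 0:  next stacked argument
//     void *__gr_top;   // offset 8:  byte past the GP register save area
//     void *__vr_top;   // offset 16: byte past the FP/SIMD register save area
//     int   __gr_offs;  // offset 24: -(bytes of unused GP save area)
//     int   __vr_offs;  // offset 28: -(bytes of unused FP/SIMD save area)
//   };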
12069
12070SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12071 SelectionDAG &DAG) const {
12072 MachineFunction &MF = DAG.getMachineFunction();
12073 Function &F = MF.getFunction();
12074
12075 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12076 return LowerWin64_VASTART(Op, DAG);
12077 else if (Subtarget->isTargetDarwin())
12078 return LowerDarwin_VASTART(Op, DAG);
12079 else
12080 return LowerAAPCS_VASTART(Op, DAG);
12081}
12082
12083SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12084 SelectionDAG &DAG) const {
12085 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
12086 // pointer.
12087 SDLoc DL(Op);
12088 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12089 unsigned VaListSize =
12090 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12091 ? PtrSize
12092 : Subtarget->isTargetILP32() ? 20 : 32;
12093 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12094 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12095
12096 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12097 DAG.getConstant(VaListSize, DL, MVT::i32),
12098 Align(PtrSize), false, false, /*CI=*/nullptr,
12099 std::nullopt, MachinePointerInfo(DestSV),
12100 MachinePointerInfo(SrcSV));
12101}
12102
12103SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12104 assert(Subtarget->isTargetDarwin() &&
12105 "automatic va_arg instruction only works on Darwin");
12106
12107 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12108 EVT VT = Op.getValueType();
12109 SDLoc DL(Op);
12110 SDValue Chain = Op.getOperand(0);
12111 SDValue Addr = Op.getOperand(1);
12112 MaybeAlign Align(Op.getConstantOperandVal(3));
12113 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12114 auto PtrVT = getPointerTy(DAG.getDataLayout());
12115 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12116 SDValue VAList =
12117 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12118 Chain = VAList.getValue(1);
12119 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12120
12121 if (VT.isScalableVector())
12122 report_fatal_error("Passing SVE types to variadic functions is "
12123 "currently not supported");
12124
12125 if (Align && *Align > MinSlotSize) {
12126 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12127 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12128 VAList =
12129 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12130 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12131 }
12132
12133 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12134 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12135
12136 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12137 // up to 64 bits. At the very least, we have to increase the striding of the
12138 // vaargs list to match this, and for FP values we need to introduce
12139 // FP_ROUND nodes as well.
12140 if (VT.isInteger() && !VT.isVector())
12141 ArgSize = std::max(ArgSize, MinSlotSize);
12142 bool NeedFPTrunc = false;
12143 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12144 ArgSize = 8;
12145 NeedFPTrunc = true;
12146 }
12147
12148 // Increment the pointer, VAList, to the next vaarg
12149 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12150 DAG.getConstant(ArgSize, DL, PtrVT));
12151 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12152
12153 // Store the incremented VAList to the legalized pointer
12154 SDValue APStore =
12155 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12156
12157 // Load the actual argument out of the pointer VAList
12158 if (NeedFPTrunc) {
12159 // Load the value as an f64.
12160 SDValue WideFP =
12161 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12162 // Round the value down to an f32.
12163 SDValue NarrowFP =
12164 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12165 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12166 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12167 // Merge the rounded value with the chain output of the load.
12168 return DAG.getMergeValues(Ops, DL);
12169 }
12170
12171 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12172}
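// Illustrative sketch of the f32 case handled above (Darwin, LP64): each
// va_arg slot is 8 bytes, so the value is loaded wide and rounded down:
//   %list = load ptr, ptr %ap                ; current va_list pointer
//   store ptr (%list + 8), ptr %ap           ; advance by the slot size
//   %wide = load double, ptr %list
//   %arg  = fptrunc double %wide to float    ; the FP_ROUND emitted above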
12173
12174SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12175 SelectionDAG &DAG) const {
12176 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12177 MFI.setFrameAddressIsTaken(true);
12178
12179 EVT VT = Op.getValueType();
12180 SDLoc DL(Op);
12181 unsigned Depth = Op.getConstantOperandVal(0);
12182 SDValue FrameAddr =
12183 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12184 while (Depth--)
12185 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12187
12188 if (Subtarget->isTargetILP32())
12189 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12190 DAG.getValueType(VT));
12191
12192 return FrameAddr;
12193}
12194
12195SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12196 SelectionDAG &DAG) const {
12197 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12198
12199 EVT VT = getPointerTy(DAG.getDataLayout());
12200 int FI = MFI.CreateFixedObject(4, 0, false);
12201 return DAG.getFrameIndex(FI, VT);
12202}
12203
12204#define GET_REGISTER_MATCHER
12205#include "AArch64GenAsmMatcher.inc"
12206
12207// FIXME? Maybe this could be a TableGen attribute on some registers and
12208// this table could be generated automatically from RegInfo.
12209Register AArch64TargetLowering::
12210getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12211 Register Reg = MatchRegisterName(RegName);
12212 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12213 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12214 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12215 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12216 !MRI->isReservedReg(MF, Reg))
12217 Reg = Register();
12218 }
12219 return Reg;
12220}
12221
12222SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12223 SelectionDAG &DAG) const {
12225
12226 EVT VT = Op.getValueType();
12227 SDLoc DL(Op);
12228
12229 SDValue FrameAddr =
12230 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12232
12233 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12234}
12235
12236SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12237 SelectionDAG &DAG) const {
12238 MachineFunction &MF = DAG.getMachineFunction();
12239 MachineFrameInfo &MFI = MF.getFrameInfo();
12240 MFI.setReturnAddressIsTaken(true);
12241
12242 EVT VT = Op.getValueType();
12243 SDLoc DL(Op);
12244 unsigned Depth = Op.getConstantOperandVal(0);
12245 SDValue ReturnAddress;
12246 if (Depth) {
12247 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12249 ReturnAddress = DAG.getLoad(
12250 VT, DL, DAG.getEntryNode(),
12251 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12252 } else {
12253 // Return LR, which contains the return address. Mark it an implicit
12254 // live-in.
12255 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12256 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12257 }
12258
12259 // The XPACLRI instruction assembles to a hint-space instruction before
12260 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
12261 // On Armv8.3-A and onwards, XPACI is available, so use
12262 // that instead.
12263 SDNode *St;
12264 if (Subtarget->hasPAuth()) {
12265 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12266 } else {
12267 // XPACLRI operates on LR therefore we must move the operand accordingly.
12268 SDValue Chain =
12269 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12270 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12271 }
12272 return SDValue(St, 0);
12273}
12274
12275 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12276 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
12277SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12278 SelectionDAG &DAG) const {
12279 SDValue Lo, Hi;
12280 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12281 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12282}
12283
12284 bool AArch64TargetLowering::isOffsetFoldingLegal(
12285 const GlobalAddressSDNode *GA) const {
12286 // Offsets are folded in the DAG combine rather than here so that we can
12287 // intelligently choose an offset based on the uses.
12288 return false;
12289}
12290
12291 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12292 bool OptForSize) const {
12293 bool IsLegal = false;
12294 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12295 // and for the 16-bit case when the target has full fp16 support.
12296 // We encode bf16 bit patterns as if they were fp16. This results in very
12297 // strange looking assembly but should populate the register with appropriate
12298 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12299 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12300 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12301 // FIXME: We should be able to handle f128 as well with a clever lowering.
12302 const APInt ImmInt = Imm.bitcastToAPInt();
12303 if (VT == MVT::f64)
12304 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12305 else if (VT == MVT::f32)
12306 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12307 else if (VT == MVT::f16 || VT == MVT::bf16)
12308 IsLegal =
12309 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12310 Imm.isPosZero();
12311
12312 // If we cannot materialize the value in the immediate field of an fmov,
12313 // check whether it can be encoded as the immediate operand of a logical instruction.
12314 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12315 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12316 // generate that fmov.
12317 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12318 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12319 // however the mov+fmov sequence is always better because of the reduced
12320 // cache pressure. The timings are still the same if you consider
12321 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12322 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
12325 assert(Insn.size() <= 4 &&
12326 "Should be able to build any value with at most 4 moves");
12327 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12328 IsLegal = Insn.size() <= Limit;
12329 }
12330
12331 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12332 << " imm value: "; Imm.dump(););
12333 return IsLegal;
12334}
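// Illustrative examples for the checks above: constants such as 1.0, 0.5 or
// -2.0 fit the 8-bit FMOV immediate encoding and are reported legal, e.g.
//   fmov d0, #1.00000000
// A value like 0.1f has no FMOV encoding, but its bit pattern (0x3DCCCCCD)
// can be built with two integer moves plus an fmov, so it still passes the
// Insn.size() <= Limit check in the common case (sequence is illustrative):
//   mov  w8, #0xcccd
//   movk w8, #0x3dcc, lsl #16
//   fmov s0, w8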
12335
12336//===----------------------------------------------------------------------===//
12337// AArch64 Optimization Hooks
12338//===----------------------------------------------------------------------===//
12339
12340static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12341 SDValue Operand, SelectionDAG &DAG,
12342 int &ExtraSteps) {
12343 EVT VT = Operand.getValueType();
12344 if ((ST->hasNEON() &&
12345 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12346 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12347 VT == MVT::v4f32)) ||
12348 (ST->hasSVE() &&
12349 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12351 // For the reciprocal estimates, convergence is quadratic, so the number
12352 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12353 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12354 // the result for float (23 mantissa bits) is 2 and for double (52
12355 // mantissa bits) is 3.
12356 constexpr unsigned AccurateBits = 8;
12357 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12358 ExtraSteps = DesiredBits <= AccurateBits
12359 ? 0
12360 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12361 }
12362
12363 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12364 }
12365
12366 return SDValue();
12367}
12368
12369SDValue
12370AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12371 const DenormalMode &Mode) const {
12372 SDLoc DL(Op);
12373 EVT VT = Op.getValueType();
12374 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12375 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12376 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12377}
12378
12379SDValue
12380AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12381 SelectionDAG &DAG) const {
12382 return Op;
12383}
12384
12385SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12386 SelectionDAG &DAG, int Enabled,
12387 int &ExtraSteps,
12388 bool &UseOneConst,
12389 bool Reciprocal) const {
12390 if (Enabled == ReciprocalEstimate::Enabled ||
12391 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12392 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12393 DAG, ExtraSteps)) {
12394 SDLoc DL(Operand);
12395 EVT VT = Operand.getValueType();
12396
12397 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12400
12401 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12402 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12403 for (int i = ExtraSteps; i > 0; --i) {
12404 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12405 Flags);
12406 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12407 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12408 }
12409 if (!Reciprocal)
12410 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12411
12412 ExtraSteps = 0;
12413 return Estimate;
12414 }
12415
12416 return SDValue();
12417}
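// Illustrative expansion of the refinement loop above for f32, where
// getEstimate derives ExtraSteps == 2 from the 2^-8 initial accuracy:
//   e0 = frsqrte(x)
//   e1 = e0 * frsqrts(x, e0 * e0)   ; each step computes E * 0.5*(3 - X*E^2)
//   e2 = e1 * frsqrts(x, e1 * e1)
// and, when the full square root rather than 1/sqrt was requested, a final
//   sqrt(x) ~= x * e2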
12418
12419SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12420 SelectionDAG &DAG, int Enabled,
12421 int &ExtraSteps) const {
12422 if (Enabled == ReciprocalEstimate::Enabled)
12423 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12424 DAG, ExtraSteps)) {
12425 SDLoc DL(Operand);
12426 EVT VT = Operand.getValueType();
12427
12429
12430 // Newton reciprocal iteration: E * (2 - X * E)
12431 // AArch64 reciprocal iteration instruction: (2 - M * N)
12432 for (int i = ExtraSteps; i > 0; --i) {
12433 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12434 Estimate, Flags);
12435 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12436 }
12437
12438 ExtraSteps = 0;
12439 return Estimate;
12440 }
12441
12442 return SDValue();
12443}
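// Illustrative expansion of the reciprocal refinement above for f32 with
// ExtraSteps == 2:
//   e0 = frecpe(x)
//   e1 = e0 * frecps(x, e0)         ; each step computes E * (2 - X*E)
//   e2 = e1 * frecps(x, e1)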
12444
12445//===----------------------------------------------------------------------===//
12446// AArch64 Inline Assembly Support
12447//===----------------------------------------------------------------------===//
12448
12449// Table of Constraints
12450// TODO: This is the current set of constraints supported by ARM for the
12451// compiler, not all of them may make sense.
12452//
12453// r - A general register
12454// w - An FP/SIMD register of some size in the range v0-v31
12455// x - An FP/SIMD register of some size in the range v0-v15
12456// I - Constant that can be used with an ADD instruction
12457// J - Constant that can be used with a SUB instruction
12458// K - Constant that can be used with a 32-bit logical instruction
12459// L - Constant that can be used with a 64-bit logical instruction
12460// M - Constant that can be used as a 32-bit MOV immediate
12461// N - Constant that can be used as a 64-bit MOV immediate
12462// Q - A memory reference with base register and no offset
12463// S - A symbolic address
12464// Y - Floating point constant zero
12465// Z - Integer constant zero
12466//
12467// Note that general register operands will be output using their 64-bit x
12468// register name, whatever the size of the variable, unless the asm operand
12469// is prefixed by the %w modifier. Floating-point and SIMD register operands
12470// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12471// %q modifier.
12472const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12473 // At this point, we have to lower this constraint to something else, so we
12474 // lower it to an "r" or "w". However, by doing this we will force the result
12475 // to be in a register, while the X constraint is much more permissive.
12476 //
12477 // Although we are correct (we are free to emit anything, without
12478 // constraints), we might break use cases that would expect us to be more
12479 // efficient and emit something else.
12480 if (!Subtarget->hasFPARMv8())
12481 return "r";
12482
12483 if (ConstraintVT.isFloatingPoint())
12484 return "w";
12485
12486 if (ConstraintVT.isVector() &&
12487 (ConstraintVT.getSizeInBits() == 64 ||
12488 ConstraintVT.getSizeInBits() == 128))
12489 return "w";
12490
12491 return "r";
12492}
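// Illustrative uses of the constraints listed above, assuming Clang/GCC
// extended inline-asm syntax (examples only, not exhaustive):
//   int r;
//   asm("add %w0, %w1, %2" : "=r"(r) : "r"(a), "I"(4095));   // ADD immediate
//   float s;
//   asm("fadd %s0, %s1, %s2" : "=w"(s) : "w"(x), "w"(y));     // FP/SIMD regs
// Note the %w and %s modifiers selecting the 32-bit GPR and scalar FP views,
// as described in the register-naming note above.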
12493
12495
12496// Returns a {Reg, RegisterClass} tuple if the constraint is
12497// a specific predicate register.
12498//
12499// For some constraint like "{pn3}" the default path in
12500// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12501// suitable register class for this register is "PPRorPNR", after which it
12502// determines that nxv16i1 is an appropriate type for the constraint, which is
12503// not what we want. The code here pre-empts this by matching the register
12504// explicitly.
12505static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12507 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12508 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12509 return std::nullopt;
12510
12511 bool IsPredicate = Constraint[1] == 'p';
12512 Constraint = Constraint.substr(2, Constraint.size() - 3);
12513 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12514 if (IsPredicateAsCount)
12515 Constraint = Constraint.drop_front(1);
12516
12517 unsigned V;
12518 if (Constraint.getAsInteger(10, V) || V > 31)
12519 return std::nullopt;
12520
12521 if (IsPredicateAsCount)
12522 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12523 if (IsPredicate)
12524 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12525 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12526}
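// Examples of what the matcher above accepts (illustrative):
//   "{z5}"  -> (AArch64::Z5,  ZPRRegClass)
//   "{p3}"  -> (AArch64::P3,  PPRRegClass)
//   "{pn8}" -> (AArch64::PN8, PNRRegClass)
// Anything else (or an index above 31) returns std::nullopt and falls back
// to the default constraint handling.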
12527
12528static std::optional<PredicateConstraint>
12531 .Case("Uph", PredicateConstraint::Uph)
12532 .Case("Upl", PredicateConstraint::Upl)
12533 .Case("Upa", PredicateConstraint::Upa)
12534 .Default(std::nullopt);
12535}
12536
12537static const TargetRegisterClass *
12539 if (VT != MVT::aarch64svcount &&
12540 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12541 return nullptr;
12542
12543 switch (Constraint) {
12544 case PredicateConstraint::Uph:
12545 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12546 : &AArch64::PPR_p8to15RegClass;
12547 case PredicateConstraint::Upl:
12548 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12549 : &AArch64::PPR_3bRegClass;
12550 case PredicateConstraint::Upa:
12551 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12552 : &AArch64::PPRRegClass;
12553 }
12554
12555 llvm_unreachable("Missing PredicateConstraint!");
12556}
12557
12559
12560static std::optional<ReducedGprConstraint>
12563 .Case("Uci", ReducedGprConstraint::Uci)
12564 .Case("Ucj", ReducedGprConstraint::Ucj)
12565 .Default(std::nullopt);
12566}
12567
12568static const TargetRegisterClass *
12570 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12571 return nullptr;
12572
12573 switch (Constraint) {
12574 case ReducedGprConstraint::Uci:
12575 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12576 case ReducedGprConstraint::Ucj:
12577 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12578 }
12579
12580 llvm_unreachable("Missing ReducedGprConstraint!");
12581}
12582
12583 // The set of condition codes ("cc") supported is taken from
12584// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12587 .Case("{@cchi}", AArch64CC::HI)
12588 .Case("{@cccs}", AArch64CC::HS)
12589 .Case("{@cclo}", AArch64CC::LO)
12590 .Case("{@ccls}", AArch64CC::LS)
12591 .Case("{@cccc}", AArch64CC::LO)
12592 .Case("{@cceq}", AArch64CC::EQ)
12593 .Case("{@ccgt}", AArch64CC::GT)
12594 .Case("{@ccge}", AArch64CC::GE)
12595 .Case("{@cclt}", AArch64CC::LT)
12596 .Case("{@ccle}", AArch64CC::LE)
12597 .Case("{@cchs}", AArch64CC::HS)
12598 .Case("{@ccne}", AArch64CC::NE)
12599 .Case("{@ccvc}", AArch64CC::VC)
12600 .Case("{@ccpl}", AArch64CC::PL)
12601 .Case("{@ccvs}", AArch64CC::VS)
12602 .Case("{@ccmi}", AArch64CC::MI)
12604 return Cond;
12605}
12606
12607/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12608/// WZR, invert(<cond>)'.
12610 SelectionDAG &DAG) {
12611 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12612 DAG.getConstant(0, DL, MVT::i32),
12613 DAG.getConstant(0, DL, MVT::i32),
12614 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12615}
12616
12617// Lower @cc flag output via getSETCC.
12618SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12619 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12620 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12621 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12622 if (Cond == AArch64CC::Invalid)
12623 return SDValue();
12624 // The output variable should be a scalar integer.
12625 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12626 OpInfo.ConstraintVT.getSizeInBits() < 8)
12627 report_fatal_error("Flag output operand is of invalid type");
12628
12629 // Get NZCV register. Only update chain when copyfrom is glued.
12630 if (Glue.getNode()) {
12631 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12632 Chain = Glue.getValue(1);
12633 } else
12634 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12635 // Extract CC code.
12636 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12637
12638 SDValue Result;
12639
12640 // Truncate or ZERO_EXTEND based on value types.
12641 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12642 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12643 else
12644 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12645
12646 return Result;
12647}
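// Illustrative use of the "@cc" flag outputs handled above, assuming the
// GCC/Clang flag-output-operand syntax:
//   bool eq;
//   asm("cmp %w1, %w2" : "=@cceq"(eq) : "r"(a), "r"(b));
// The NZCV copy feeds the CSINC built by getSETCC, materialising the chosen
// condition as 0/1 before the truncate/zero-extend to the output type.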
12648
12649/// getConstraintType - Given a constraint letter, return the type of
12650/// constraint it is for this target.
12652AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12653 if (Constraint.size() == 1) {
12654 switch (Constraint[0]) {
12655 default:
12656 break;
12657 case 'x':
12658 case 'w':
12659 case 'y':
12660 return C_RegisterClass;
12661 // An address with a single base register. Due to the way we
12662 // currently handle addresses it is the same as 'r'.
12663 case 'Q':
12664 return C_Memory;
12665 case 'I':
12666 case 'J':
12667 case 'K':
12668 case 'L':
12669 case 'M':
12670 case 'N':
12671 case 'Y':
12672 case 'Z':
12673 return C_Immediate;
12674 case 'z':
12675 case 'S': // A symbol or label reference with a constant offset
12676 return C_Other;
12677 }
12678 } else if (parsePredicateConstraint(Constraint))
12679 return C_RegisterClass;
12680 else if (parseReducedGprConstraint(Constraint))
12681 return C_RegisterClass;
12682 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12683 return C_Other;
12684 return TargetLowering::getConstraintType(Constraint);
12685}
12686
12687/// Examine constraint type and operand type and determine a weight value.
12688/// This object must already have been set up with the operand type
12689/// and the current alternative constraint selected.
12691AArch64TargetLowering::getSingleConstraintMatchWeight(
12692 AsmOperandInfo &info, const char *constraint) const {
12694 Value *CallOperandVal = info.CallOperandVal;
12695 // If we don't have a value, we can't do a match,
12696 // but allow it at the lowest weight.
12697 if (!CallOperandVal)
12698 return CW_Default;
12699 Type *type = CallOperandVal->getType();
12700 // Look at the constraint type.
12701 switch (*constraint) {
12702 default:
12704 break;
12705 case 'x':
12706 case 'w':
12707 case 'y':
12708 if (type->isFloatingPointTy() || type->isVectorTy())
12709 weight = CW_Register;
12710 break;
12711 case 'z':
12712 weight = CW_Constant;
12713 break;
12714 case 'U':
12715 if (parsePredicateConstraint(constraint) ||
12716 parseReducedGprConstraint(constraint))
12717 weight = CW_Register;
12718 break;
12719 }
12720 return weight;
12721}
12722
12723std::pair<unsigned, const TargetRegisterClass *>
12724AArch64TargetLowering::getRegForInlineAsmConstraint(
12725 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12726 if (Constraint.size() == 1) {
12727 switch (Constraint[0]) {
12728 case 'r':
12729 if (VT.isScalableVector())
12730 return std::make_pair(0U, nullptr);
12731 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12732 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12733 if (VT.getFixedSizeInBits() == 64)
12734 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12735 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12736 case 'w': {
12737 if (!Subtarget->hasFPARMv8())
12738 break;
12739 if (VT.isScalableVector()) {
12740 if (VT.getVectorElementType() != MVT::i1)
12741 return std::make_pair(0U, &AArch64::ZPRRegClass);
12742 return std::make_pair(0U, nullptr);
12743 }
12744 if (VT == MVT::Other)
12745 break;
12746 uint64_t VTSize = VT.getFixedSizeInBits();
12747 if (VTSize == 16)
12748 return std::make_pair(0U, &AArch64::FPR16RegClass);
12749 if (VTSize == 32)
12750 return std::make_pair(0U, &AArch64::FPR32RegClass);
12751 if (VTSize == 64)
12752 return std::make_pair(0U, &AArch64::FPR64RegClass);
12753 if (VTSize == 128)
12754 return std::make_pair(0U, &AArch64::FPR128RegClass);
12755 break;
12756 }
12757 // The instructions that this constraint is designed for can
12758 // only take 128-bit registers so just use that regclass.
12759 case 'x':
12760 if (!Subtarget->hasFPARMv8())
12761 break;
12762 if (VT.isScalableVector())
12763 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12764 if (VT.getSizeInBits() == 128)
12765 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12766 break;
12767 case 'y':
12768 if (!Subtarget->hasFPARMv8())
12769 break;
12770 if (VT.isScalableVector())
12771 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12772 break;
12773 }
12774 } else {
12775 if (const auto P = parseSVERegAsConstraint(Constraint)) {
12776 // SME functions that are not in streaming mode should
12777 // still observe clobbers of Z-registers by clobbering
12778 // the lower 128 bits of those registers.
12779 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
12780 !Subtarget->isSVEorStreamingSVEAvailable())
12781 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
12782 &AArch64::FPR128RegClass);
12783 return *P;
12784 }
12785 if (const auto PC = parsePredicateConstraint(Constraint))
12786 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12787 return std::make_pair(0U, RegClass);
12788
12789 if (const auto RGC = parseReducedGprConstraint(Constraint))
12790 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12791 return std::make_pair(0U, RegClass);
12792 }
12793 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12795 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12796
12797 if (Constraint == "{za}") {
12798 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12799 }
12800
12801 if (Constraint == "{zt0}") {
12802 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12803 }
12804
12805 // Use the default implementation in TargetLowering to convert the register
12806 // constraint into a member of a register class.
12807 std::pair<unsigned, const TargetRegisterClass *> Res;
12808 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12809
12810 // Not found as a standard register?
12811 if (!Res.second) {
12812 unsigned Size = Constraint.size();
12813 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12814 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12815 int RegNo;
12816 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12817 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12818 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12819 // By default we'll emit v0-v31 for this unless there's a modifier where
12820 // we'll emit the correct register as well.
12821 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12822 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12823 Res.second = &AArch64::FPR64RegClass;
12824 } else {
12825 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12826 Res.second = &AArch64::FPR128RegClass;
12827 }
12828 }
12829 }
12830 }
12831
12832 if (Res.second && !Subtarget->hasFPARMv8() &&
12833 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12834 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12835 return std::make_pair(0U, nullptr);
12836
12837 return Res;
12838}
12839
12841 llvm::Type *Ty,
12842 bool AllowUnknown) const {
12843 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12844 return EVT(MVT::i64x8);
12845
12846 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12847}
12848
12849/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12850/// vector. If it is invalid, don't add anything to Ops.
12851void AArch64TargetLowering::LowerAsmOperandForConstraint(
12852 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12853 SelectionDAG &DAG) const {
12854 SDValue Result;
12855
12856 // Currently only support length 1 constraints.
12857 if (Constraint.size() != 1)
12858 return;
12859
12860 char ConstraintLetter = Constraint[0];
12861 switch (ConstraintLetter) {
12862 default:
12863 break;
12864
12865 // This set of constraints deals with valid constants for various instructions.
12866 // Validate and return a target constant for them if we can.
12867 case 'z': {
12868 // 'z' maps to xzr or wzr so it needs an input of 0.
12869 if (!isNullConstant(Op))
12870 return;
12871
12872 if (Op.getValueType() == MVT::i64)
12873 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12874 else
12875 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12876 break;
12877 }
12878 case 'S':
12879 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12880 // supported for PIC while "s" isn't, making "s" less useful. We implement
12881 // "S" but not "s".
12883 break;
12884
12885 case 'I':
12886 case 'J':
12887 case 'K':
12888 case 'L':
12889 case 'M':
12890 case 'N':
12891 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12892 if (!C)
12893 return;
12894
12895 // Grab the value and do some validation.
12896 uint64_t CVal = C->getZExtValue();
12897 switch (ConstraintLetter) {
12898 // The I constraint applies only to simple ADD or SUB immediate operands:
12899 // i.e. 0 to 4095 with optional shift by 12
12900 // The J constraint applies only to ADD or SUB immediates that would be
12901 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12902 // instruction [or vice versa], in other words -1 to -4095 with optional
12903 // left shift by 12.
12904 case 'I':
12905 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12906 break;
12907 return;
12908 case 'J': {
12909 uint64_t NVal = -C->getSExtValue();
12910 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12911 CVal = C->getSExtValue();
12912 break;
12913 }
12914 return;
12915 }
12916 // The K and L constraints apply *only* to logical immediates, including
12917 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12918 // been removed and MOV should be used). So these constraints have to
12919 // distinguish between bit patterns that are valid 32-bit or 64-bit
12920 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12921 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12922 // versa.
12923 case 'K':
12924 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12925 break;
12926 return;
12927 case 'L':
12928 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12929 break;
12930 return;
12931 // The M and N constraints are a superset of K and L respectively, for use
12932 // with the MOV (immediate) alias. As well as the logical immediates they
12933 // also match 32 or 64-bit immediates that can be loaded either using a
12934 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12935 // (M) or 64-bit 0x1234000000000000 (N) etc.
12936 // As a note some of this code is liberally stolen from the asm parser.
12937 case 'M': {
12938 if (!isUInt<32>(CVal))
12939 return;
12940 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12941 break;
12942 if ((CVal & 0xFFFF) == CVal)
12943 break;
12944 if ((CVal & 0xFFFF0000ULL) == CVal)
12945 break;
12946 uint64_t NCVal = ~(uint32_t)CVal;
12947 if ((NCVal & 0xFFFFULL) == NCVal)
12948 break;
12949 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12950 break;
12951 return;
12952 }
12953 case 'N': {
12954 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12955 break;
12956 if ((CVal & 0xFFFFULL) == CVal)
12957 break;
12958 if ((CVal & 0xFFFF0000ULL) == CVal)
12959 break;
12960 if ((CVal & 0xFFFF00000000ULL) == CVal)
12961 break;
12962 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12963 break;
12964 uint64_t NCVal = ~CVal;
12965 if ((NCVal & 0xFFFFULL) == NCVal)
12966 break;
12967 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12968 break;
12969 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12970 break;
12971 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12972 break;
12973 return;
12974 }
12975 default:
12976 return;
12977 }
12978
12979 // All assembler immediates are 64-bit integers.
12980 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12981 break;
12982 }
12983
12984 if (Result.getNode()) {
12985 Ops.push_back(Result);
12986 return;
12987 }
12988
12989 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12990}
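// A few illustrative immediates for the validation above:
//   'I': 0..4095, optionally shifted left by 12       e.g. 4095, 0xFFF000
//   'J': the negatable range                          e.g. -1 .. -4095
//   'K': a valid 32-bit logical immediate             e.g. 0xAAAAAAAA
//   'L': a valid 64-bit logical immediate             e.g. 0xAAAAAAAAAAAAAAAA
//   'M': 'K' plus single MOVZ/MOVN-able 32-bit values e.g. 0x12340000
//   'N': 'L' plus the 64-bit equivalents              e.g. 0x1234000000000000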
12991
12992//===----------------------------------------------------------------------===//
12993// AArch64 Advanced SIMD Support
12994//===----------------------------------------------------------------------===//
12995
12996/// WidenVector - Given a value in the V64 register class, produce the
12997/// equivalent value in the V128 register class.
12999 EVT VT = V64Reg.getValueType();
13000 unsigned NarrowSize = VT.getVectorNumElements();
13001 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13002 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13003 SDLoc DL(V64Reg);
13004
13005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13006 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13007}
13008
13009/// getExtFactor - Determine the adjustment factor for the position when
13010/// generating an "extract from vector registers" instruction.
13011static unsigned getExtFactor(SDValue &V) {
13012 EVT EltType = V.getValueType().getVectorElementType();
13013 return EltType.getSizeInBits() / 8;
13014}
13015
13016// Check if a vector is built from one vector via extracted elements of
13017// another together with an AND mask, ensuring that all elements fit
13018// within range. This can be reconstructed using AND and NEON's TBL1.
13020 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13021 SDLoc DL(Op);
13022 EVT VT = Op.getValueType();
13023 assert(!VT.isScalableVector() &&
13024 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13025
13026 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13027 // directly to TBL1.
13028 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13029 return SDValue();
13030
13031 unsigned NumElts = VT.getVectorNumElements();
13032 assert((NumElts == 8 || NumElts == 16) &&
13033 "Need to have exactly 8 or 16 elements in vector.");
13034
13035 SDValue SourceVec;
13036 SDValue MaskSourceVec;
13037 SmallVector<SDValue, 16> AndMaskConstants;
13038
13039 for (unsigned i = 0; i < NumElts; ++i) {
13040 SDValue V = Op.getOperand(i);
13041 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13042 return SDValue();
13043
13044 SDValue OperandSourceVec = V.getOperand(0);
13045 if (!SourceVec)
13046 SourceVec = OperandSourceVec;
13047 else if (SourceVec != OperandSourceVec)
13048 return SDValue();
13049
13050 // This only looks at shuffles with elements that are
13051 // a) truncated by a constant AND mask extracted from a mask vector, or
13052 // b) extracted directly from a mask vector.
13053 SDValue MaskSource = V.getOperand(1);
13054 if (MaskSource.getOpcode() == ISD::AND) {
13055 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13056 return SDValue();
13057
13058 AndMaskConstants.push_back(MaskSource.getOperand(1));
13059 MaskSource = MaskSource->getOperand(0);
13060 } else if (!AndMaskConstants.empty()) {
13061 // Either all or no operands should have an AND mask.
13062 return SDValue();
13063 }
13064
13065 // An ANY_EXTEND may be inserted between the AND and the source vector
13066 // extraction. We don't care about that, so we can just skip it.
13067 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13068 MaskSource = MaskSource.getOperand(0);
13069
13070 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13071 return SDValue();
13072
13073 SDValue MaskIdx = MaskSource.getOperand(1);
13074 if (!isa<ConstantSDNode>(MaskIdx) ||
13075 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13076 return SDValue();
13077
13078 // We only apply this if all elements come from the same vector with the
13079 // same vector type.
13080 if (!MaskSourceVec) {
13081 MaskSourceVec = MaskSource->getOperand(0);
13082 if (MaskSourceVec.getValueType() != VT)
13083 return SDValue();
13084 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13085 return SDValue();
13086 }
13087 }
13088
13089 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13090 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13091 // insert, we know that the index in the mask must be smaller than the number
13092 // of elements in the source, or we would have an out-of-bounds access.
13093 if (NumElts == 8)
13094 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13095 DAG.getUNDEF(VT));
13096
13097 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13098 if (!AndMaskConstants.empty())
13099 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13100 DAG.getBuildVector(VT, DL, AndMaskConstants));
13101
13102 return DAG.getNode(
13103 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13104 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13105 MaskSourceVec);
13106}
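// Illustrative shape of a BUILD_VECTOR the routine above matches (v8i8,
// pseudo-IR, index masks assumed to keep every lane in range):
//   lane[i] = extractelt(%src, and(extractelt(%maskvec, i), 7))   i = 0..7
// which is rebuilt as a masked table lookup:
//   %m = and %maskvec, <8 x i8> <7, 7, ...>   ; only when AND masks were seen
//   %r = aarch64.neon.tbl1(concat(%src, undef), %m)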
13107
13108// Gather data to see if the operation can be modelled as a
13109// shuffle in combination with VEXTs.
13111 SelectionDAG &DAG) const {
13112 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13113 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13114 SDLoc DL(Op);
13115 EVT VT = Op.getValueType();
13116 assert(!VT.isScalableVector() &&
13117 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13118 unsigned NumElts = VT.getVectorNumElements();
13119
13120 struct ShuffleSourceInfo {
13121 SDValue Vec;
13122 unsigned MinElt;
13123 unsigned MaxElt;
13124
13125 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13126 // be compatible with the shuffle we intend to construct. As a result
13127 // ShuffleVec will be some sliding window into the original Vec.
13128 SDValue ShuffleVec;
13129
13130 // Code should guarantee that element i in Vec starts at element
13131 // "WindowBase + i * WindowScale" in ShuffleVec.
13132 int WindowBase;
13133 int WindowScale;
13134
13135 ShuffleSourceInfo(SDValue Vec)
13136 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13137 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13138
13139 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13140 };
13141
13142 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13143 // node.
13145 for (unsigned i = 0; i < NumElts; ++i) {
13146 SDValue V = Op.getOperand(i);
13147 if (V.isUndef())
13148 continue;
13149 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13150 !isa<ConstantSDNode>(V.getOperand(1)) ||
13151 V.getOperand(0).getValueType().isScalableVector()) {
13152 LLVM_DEBUG(
13153 dbgs() << "Reshuffle failed: "
13154 "a shuffle can only come from building a vector from "
13155 "various elements of other fixed-width vectors, provided "
13156 "their indices are constant\n");
13157 return SDValue();
13158 }
13159
13160 // Add this element source to the list if it's not already there.
13161 SDValue SourceVec = V.getOperand(0);
13162 auto Source = find(Sources, SourceVec);
13163 if (Source == Sources.end())
13164 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13165
13166 // Update the minimum and maximum lane number seen.
13167 unsigned EltNo = V.getConstantOperandVal(1);
13168 Source->MinElt = std::min(Source->MinElt, EltNo);
13169 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13170 }
13171
13172 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13173 // better than moving to/from gpr registers for larger vectors.
13174 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13175 // Construct a mask for the tbl. We may need to adjust the index for types
13176 // larger than i8.
13178 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13179 for (unsigned I = 0; I < NumElts; ++I) {
13180 SDValue V = Op.getOperand(I);
13181 if (V.isUndef()) {
13182 for (unsigned OF = 0; OF < OutputFactor; OF++)
13183 Mask.push_back(-1);
13184 continue;
13185 }
13186 // Set the Mask lanes adjusted for the size of the input and output
13187 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13188 // output element, adjusted in their positions per input and output types.
13189 unsigned Lane = V.getConstantOperandVal(1);
13190 for (unsigned S = 0; S < Sources.size(); S++) {
13191 if (V.getOperand(0) == Sources[S].Vec) {
13192 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13193 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13194 for (unsigned OF = 0; OF < OutputFactor; OF++)
13195 Mask.push_back(InputBase + OF);
13196 break;
13197 }
13198 }
13199 }
13200
13201 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13202 // v16i8, and the TBLMask
13203 SmallVector<SDValue, 16> TBLOperands;
13204 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13205 ? Intrinsic::aarch64_neon_tbl3
13206 : Intrinsic::aarch64_neon_tbl4,
13207 DL, MVT::i32));
13208 for (unsigned i = 0; i < Sources.size(); i++) {
13209 SDValue Src = Sources[i].Vec;
13210 EVT SrcVT = Src.getValueType();
13211 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13212 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13213 "Expected a legally typed vector");
13214 if (SrcVT.is64BitVector())
13215 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13216 DAG.getUNDEF(MVT::v8i8));
13217 TBLOperands.push_back(Src);
13218 }
13219
13221 for (unsigned i = 0; i < Mask.size(); i++)
13222 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13223 assert((Mask.size() == 8 || Mask.size() == 16) &&
13224 "Expected a v8i8 or v16i8 Mask");
13225 TBLOperands.push_back(DAG.getBuildVector(
13226 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13227
13228 SDValue Shuffle =
13229 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13230 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13231 return DAG.getBitcast(VT, Shuffle);
13232 }
13233
13234 if (Sources.size() > 2) {
13235 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13236 << "sensible when at most two source vectors are "
13237 << "involved\n");
13238 return SDValue();
13239 }
13240
13241 // Find out the smallest element size among the result and the two sources,
13242 // and use it as the element size to build the shuffle_vector.
13243 EVT SmallestEltTy = VT.getVectorElementType();
13244 for (auto &Source : Sources) {
13245 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13246 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13247 SmallestEltTy = SrcEltTy;
13248 }
13249 }
13250 unsigned ResMultiplier =
13251 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13252 uint64_t VTSize = VT.getFixedSizeInBits();
13253 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13254 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13255
13256 // If the source vector is too wide or too narrow, we may nevertheless be able
13257 // to construct a compatible shuffle either by concatenating it with UNDEF or
13258 // extracting a suitable range of elements.
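// As a sketch of the cases handled below (illustrative only): a half-width
// source is padded with UNDEF via CONCAT_VECTORS; a double-width source whose
// used lanes all sit in one half is narrowed with EXTRACT_SUBVECTOR; and a
// double-width source whose used lanes straddle the halves needs an
// AArch64ISD::EXT of the two halves to slide them into one vector.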
13259 for (auto &Src : Sources) {
13260 EVT SrcVT = Src.ShuffleVec.getValueType();
13261
13262 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13263 if (SrcVTSize == TypeSize::getFixed(VTSize))
13264 continue;
13265
13266 // This stage of the search produces a source with the same element type as
13267 // the original, but with a total width matching the BUILD_VECTOR output.
13268 EVT EltVT = SrcVT.getVectorElementType();
13269 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13270 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13271
13272 if (SrcVTSize.getFixedValue() < VTSize) {
13273 assert(2 * SrcVTSize == VTSize);
13274 // We can pad out the smaller vector for free, so if it's part of a
13275 // shuffle...
13276 Src.ShuffleVec =
13277 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13278 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13279 continue;
13280 }
13281
13282 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13283 LLVM_DEBUG(
13284 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13285 return SDValue();
13286 }
13287
13288 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13289 LLVM_DEBUG(
13290 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13291 return SDValue();
13292 }
13293
13294 if (Src.MinElt >= NumSrcElts) {
13295 // The extraction can just take the second half
13296 Src.ShuffleVec =
13297 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13298 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13299 Src.WindowBase = -NumSrcElts;
13300 } else if (Src.MaxElt < NumSrcElts) {
13301 // The extraction can just take the first half
13302 Src.ShuffleVec =
13303 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13304 DAG.getConstant(0, DL, MVT::i64));
13305 } else {
13306 // An actual VEXT is needed
13307 SDValue VEXTSrc1 =
13308 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13309 DAG.getConstant(0, DL, MVT::i64));
13310 SDValue VEXTSrc2 =
13311 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13312 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13313 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13314
13315 if (!SrcVT.is64BitVector()) {
13316 LLVM_DEBUG(
13317 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13318 "for SVE vectors.");
13319 return SDValue();
13320 }
13321
13322 Src.ShuffleVec =
13323 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13324 DAG.getConstant(Imm, DL, MVT::i32));
13325 Src.WindowBase = -Src.MinElt;
13326 }
13327 }
13328
13329 // Another possible incompatibility occurs from the vector element types. We
13330 // can fix this by bitcasting the source vectors to the same type we intend
13331 // for the shuffle.
13332 for (auto &Src : Sources) {
13333 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13334 if (SrcEltTy == SmallestEltTy)
13335 continue;
13336 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13337 if (DAG.getDataLayout().isBigEndian()) {
13338 Src.ShuffleVec =
13339 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13340 } else {
13341 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13342 }
13343 Src.WindowScale =
13344 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13345 Src.WindowBase *= Src.WindowScale;
13346 }
13347
13348 // Final check before we try to actually produce a shuffle.
13349 LLVM_DEBUG({
13350 for (auto Src : Sources)
13351 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13352 });
13353
13354 // The stars all align; our next step is to produce the mask for the shuffle.
13355 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13356 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13357 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13358 SDValue Entry = Op.getOperand(i);
13359 if (Entry.isUndef())
13360 continue;
13361
13362 auto Src = find(Sources, Entry.getOperand(0));
13363 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13364
13365 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13366 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13367 // segment.
13368 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13369 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13370 VT.getScalarSizeInBits());
13371 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13372
13373 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13374 // starting at the appropriate offset.
13375 int *LaneMask = &Mask[i * ResMultiplier];
13376
13377 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13378 ExtractBase += NumElts * (Src - Sources.begin());
13379 for (int j = 0; j < LanesDefined; ++j)
13380 LaneMask[j] = ExtractBase + j;
13381 }
13382
13383 // Final check before we try to produce nonsense...
13384 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13385 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13386 return SDValue();
13387 }
13388
13389 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13390 for (unsigned i = 0; i < Sources.size(); ++i)
13391 ShuffleOps[i] = Sources[i].ShuffleVec;
13392
13393 SDValue Shuffle =
13394 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13395 SDValue V;
13396 if (DAG.getDataLayout().isBigEndian()) {
13397 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13398 } else {
13399 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13400 }
13401
13402 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13403 dbgs() << "Reshuffle, creating node: "; V.dump(););
13404
13405 return V;
13406}
13407
13408// check if an EXT instruction can handle the shuffle mask when the
13409// vector sources of the shuffle are the same.
13410static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13411 unsigned NumElts = VT.getVectorNumElements();
13412
13413 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13414 if (M[0] < 0)
13415 return false;
13416
13417 Imm = M[0];
13418
13419 // If this is a VEXT shuffle, the immediate value is the index of the first
13420 // element. The other shuffle indices must be the successive elements after
13421 // the first one.
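// For example (sketch): for a v8i8 single-source shuffle, the mask
// <3, 4, 5, 6, 7, 0, 1, 2> wraps around and is accepted with Imm == 3, which
// later becomes the byte immediate of the EXT instruction.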
13422 unsigned ExpectedElt = Imm;
13423 for (unsigned i = 1; i < NumElts; ++i) {
13424 // Increment the expected index. If it wraps around, just follow it
13425 // back to index zero and keep going.
13426 ++ExpectedElt;
13427 if (ExpectedElt == NumElts)
13428 ExpectedElt = 0;
13429
13430 if (M[i] < 0)
13431 continue; // ignore UNDEF indices
13432 if (ExpectedElt != static_cast<unsigned>(M[i]))
13433 return false;
13434 }
13435
13436 return true;
13437}
13438
13439// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13440// v4i32s. This is really a truncate, which we can construct out of (legal)
13441// concats and truncate nodes.
13443 if (V.getValueType() != MVT::v16i8)
13444 return SDValue();
13445 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13446
13447 for (unsigned X = 0; X < 4; X++) {
13448 // Check the first item in each group is an extract from lane 0 of a v4i32
13449 // or v4i16.
13450 SDValue BaseExt = V.getOperand(X * 4);
13451 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13452 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13453 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13454 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13455 BaseExt.getConstantOperandVal(1) != 0)
13456 return SDValue();
13457 SDValue Base = BaseExt.getOperand(0);
13458 // And check the other items are extracts from the same vector.
13459 for (unsigned Y = 1; Y < 4; Y++) {
13460 SDValue Ext = V.getOperand(X * 4 + Y);
13461 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13462 Ext.getOperand(0) != Base ||
13463 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13464 Ext.getConstantOperandVal(1) != Y)
13465 return SDValue();
13466 }
13467 }
13468
13469 // Turn the buildvector into a series of truncates and concats, which will
13470 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13471 // concatenated together to produce 2 v8i16. These are both truncated and
13472 // concatenated together.
13473 SDLoc DL(V);
13474 SDValue Trunc[4] = {
13475 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13476 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13477 for (SDValue &V : Trunc)
13478 if (V.getValueType() == MVT::v4i32)
13479 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13480 SDValue Concat0 =
13481 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13482 SDValue Concat1 =
13483 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13484 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13485 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13486 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13487}
13488
13489 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
13490 /// element width than the vector lane type. If that is the case, the function
13491 /// returns true and writes the value of the DUP instruction lane operand into
13492 /// DupLaneOp.
13493static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13494 unsigned &DupLaneOp) {
13495 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13496 "Only possible block sizes for wide DUP are: 16, 32, 64");
13497
13498 if (BlockSize <= VT.getScalarSizeInBits())
13499 return false;
13500 if (BlockSize % VT.getScalarSizeInBits() != 0)
13501 return false;
13502 if (VT.getSizeInBits() % BlockSize != 0)
13503 return false;
13504
13505 size_t SingleVecNumElements = VT.getVectorNumElements();
13506 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13507 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13508
13509 // We are looking for masks like
13510 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13511 // might be replaced by 'undefined'. BlockIndices will eventually contain
13512 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13513 // for the above examples)
13514 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13515 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13516 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13517 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13518 if (Elt < 0)
13519 continue;
13520 // For now we don't support shuffles that use the second operand
13521 if ((unsigned)Elt >= SingleVecNumElements)
13522 return false;
13523 if (BlockElts[I] < 0)
13524 BlockElts[I] = Elt;
13525 else if (BlockElts[I] != Elt)
13526 return false;
13527 }
13528
13529 // We found a candidate block (possibly with some undefs). It must be a
13530 // sequence of consecutive integers starting with a value divisible by
13531 // NumEltsPerBlock with some values possibly replaced by undef-s.
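// For instance (sketch): BlockElts == [2, -1] with NumEltsPerBlock == 2
// identifies the duplicated block as lanes {2, 3}, giving DupLaneOp == 1.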
13532
13533 // Find first non-undef element
13534 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13535 assert(FirstRealEltIter != BlockElts.end() &&
13536 "Shuffle with all-undefs must have been caught by previous cases, "
13537 "e.g. isSplat()");
13538 if (FirstRealEltIter == BlockElts.end()) {
13539 DupLaneOp = 0;
13540 return true;
13541 }
13542
13543 // Index of FirstRealElt in BlockElts
13544 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13545
13546 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13547 return false;
13548 // BlockElts[0] must have the following value if it isn't undef:
13549 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13550
13551 // Check the first element
13552 if (Elt0 % NumEltsPerBlock != 0)
13553 return false;
13554 // Check that the sequence indeed consists of consecutive integers (modulo
13555 // undefs)
13556 for (size_t I = 0; I < NumEltsPerBlock; I++)
13557 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13558 return false;
13559
13560 DupLaneOp = Elt0 / NumEltsPerBlock;
13561 return true;
13562}
13563
13564// check if an EXT instruction can handle the shuffle mask when the
13565// vector sources of the shuffle are different.
13566static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13567 unsigned &Imm) {
13568 // Look for the first non-undef element.
13569 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13570
13571 // Use an APInt so the expected element index wraps modulo 2 * NumElts.
13572 unsigned NumElts = VT.getVectorNumElements();
13573 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13574 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13575 /*implicitTrunc=*/true);
13576 // The following shuffle indices must be the successive elements after the
13577 // first real element.
13578 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13579 return Elt != ExpectedElt++ && Elt >= 0;
13580 });
13581 if (FoundWrongElt)
13582 return false;
13583
13584 // The index of an EXT is the first element if it is not UNDEF.
13585 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13586 // value of the first element. E.g.
13587 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13588 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13589 // ExpectedElt is the last mask index plus 1.
13590 Imm = ExpectedElt.getZExtValue();
13591
13592 // There are two different cases that require reversing the input vectors.
13593 // For example, for vector <4 x i32> we have the following cases,
13594 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13595 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13596 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13597 // to reverse two input vectors.
13598 if (Imm < NumElts)
13599 ReverseEXT = true;
13600 else
13601 Imm -= NumElts;
13602
13603 return true;
13604}
13605
13606/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13607/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13608/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13609static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13610 unsigned NumElts = VT.getVectorNumElements();
13611 if (NumElts % 2 != 0)
13612 return false;
13613 WhichResult = (M[0] == 0 ? 0 : 1);
13614 unsigned Idx = WhichResult * NumElts / 2;
13615 for (unsigned i = 0; i != NumElts; i += 2) {
13616 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13617 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13618 return false;
13619 Idx += 1;
13620 }
13621
13622 return true;
13623}
13624
13625/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13626/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13627 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13628static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13629 unsigned Half = VT.getVectorNumElements() / 2;
13630 WhichResult = (M[0] == 0 ? 0 : 1);
13631 for (unsigned j = 0; j != 2; ++j) {
13632 unsigned Idx = WhichResult;
13633 for (unsigned i = 0; i != Half; ++i) {
13634 int MIdx = M[i + j * Half];
13635 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13636 return false;
13637 Idx += 2;
13638 }
13639 }
13640
13641 return true;
13642}
13643
13644/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13645/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13646/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13647static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13648 unsigned NumElts = VT.getVectorNumElements();
13649 if (NumElts % 2 != 0)
13650 return false;
13651 WhichResult = (M[0] == 0 ? 0 : 1);
13652 for (unsigned i = 0; i < NumElts; i += 2) {
13653 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13654 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13655 return false;
13656 }
13657 return true;
13658}
13659
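// (Descriptive note, not from the original source: this matches masks that
// are the identity on one input apart from a single "anomalous" lane taken
// from the other input, which can be lowered to an INS of that one lane.
// E.g., sketching v4i32, the mask <0, 1, 6, 3> gives DstIsLeft == true and
// Anomaly == 2.)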
13660static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13661 bool &DstIsLeft, int &Anomaly) {
13662 if (M.size() != static_cast<size_t>(NumInputElements))
13663 return false;
13664
13665 int NumLHSMatch = 0, NumRHSMatch = 0;
13666 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13667
13668 for (int i = 0; i < NumInputElements; ++i) {
13669 if (M[i] == -1) {
13670 ++NumLHSMatch;
13671 ++NumRHSMatch;
13672 continue;
13673 }
13674
13675 if (M[i] == i)
13676 ++NumLHSMatch;
13677 else
13678 LastLHSMismatch = i;
13679
13680 if (M[i] == i + NumInputElements)
13681 ++NumRHSMatch;
13682 else
13683 LastRHSMismatch = i;
13684 }
13685
13686 if (NumLHSMatch == NumInputElements - 1) {
13687 DstIsLeft = true;
13688 Anomaly = LastLHSMismatch;
13689 return true;
13690 } else if (NumRHSMatch == NumInputElements - 1) {
13691 DstIsLeft = false;
13692 Anomaly = LastRHSMismatch;
13693 return true;
13694 }
13695
13696 return false;
13697}
13698
13699static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13700 if (VT.getSizeInBits() != 128)
13701 return false;
13702
13703 unsigned NumElts = VT.getVectorNumElements();
13704
13705 for (int I = 0, E = NumElts / 2; I != E; I++) {
13706 if (Mask[I] != I)
13707 return false;
13708 }
13709
13710 int Offset = NumElts / 2;
13711 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13712 if (Mask[I] != I + SplitLHS * Offset)
13713 return false;
13714 }
13715
13716 return true;
13717}
13718
13718
13719 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13720 SDLoc DL(Op);
13721 EVT VT = Op.getValueType();
13722 SDValue V0 = Op.getOperand(0);
13723 SDValue V1 = Op.getOperand(1);
13724 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13725
13726 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13727 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13728 return SDValue();
13729
13730 bool SplitV0 = V0.getValueSizeInBits() == 128;
13731
13732 if (!isConcatMask(Mask, VT, SplitV0))
13733 return SDValue();
13734
13735 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13736 if (SplitV0) {
13737 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13738 DAG.getConstant(0, DL, MVT::i64));
13739 }
13740 if (V1.getValueSizeInBits() == 128) {
13741 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13742 DAG.getConstant(0, DL, MVT::i64));
13743 }
13744 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13745}
13746
13747/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13748/// the specified operations to build the shuffle. ID is the perfect-shuffle
13749 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
13750 /// table entry and LHS/RHS are the immediate inputs for this stage of the
13751 /// shuffle.
13752 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
13753 unsigned PFEntry, SDValue LHS,
13754 SDValue RHS, SelectionDAG &DAG,
13755 const SDLoc &DL) {
13756 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13757 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13758 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13759
13760 enum {
13761 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13762 OP_VREV,
13763 OP_VDUP0,
13764 OP_VDUP1,
13765 OP_VDUP2,
13766 OP_VDUP3,
13767 OP_VEXT1,
13768 OP_VEXT2,
13769 OP_VEXT3,
13770 OP_VUZPL, // VUZP, left result
13771 OP_VUZPR, // VUZP, right result
13772 OP_VZIPL, // VZIP, left result
13773 OP_VZIPR, // VZIP, right result
13774 OP_VTRNL, // VTRN, left result
13775 OP_VTRNR, // VTRN, right result
13776 OP_MOVLANE // Move lane. RHSID is the lane to move into
13777 };
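// (A brief note on the encoding, derived from how PFEntry is unpacked in this
// function; illustrative only: bits [29:26] hold the opcode, bits [25:13] the
// LHS ID and bits [12:0] the RHS ID. Each ID is a base-9 number whose digits
// are the lanes that operand must supply, with 8 standing for undef; e.g.
// digits 0,1,2,3 mean a straight copy of LHS.)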
13778
13779 if (OpNum == OP_COPY) {
13780 if (LHSID == (1 * 9 + 2) * 9 + 3)
13781 return LHS;
13782 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13783 return RHS;
13784 }
13785
13786 if (OpNum == OP_MOVLANE) {
13787 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13788 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13789 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13790 Elt = 3 - Elt;
13791 while (Elt > 0) {
13792 ID /= 9;
13793 Elt--;
13794 }
13795 return (ID % 9 == 8) ? -1 : ID % 9;
13796 };
13797
13798 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13799 // get the lane to move from the PFID, which is always from the
13800 // original vectors (V1 or V2).
13801 SDValue OpLHS = GeneratePerfectShuffle(
13802 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
13803 EVT VT = OpLHS.getValueType();
13804 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13805 unsigned ExtLane = 0;
13806 SDValue Input;
13807
13808 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13809 // convert into a higher type.
13810 if (RHSID & 0x4) {
13811 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13812 if (MaskElt == -1)
13813 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13814 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13815 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13816 Input = MaskElt < 2 ? V1 : V2;
13817 if (VT.getScalarSizeInBits() == 16) {
13818 Input = DAG.getBitcast(MVT::v2f32, Input);
13819 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13820 } else {
13821 assert(VT.getScalarSizeInBits() == 32 &&
13822 "Expected 16 or 32 bit shuffle elements");
13823 Input = DAG.getBitcast(MVT::v2f64, Input);
13824 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13825 }
13826 } else {
13827 int MaskElt = getPFIDLane(ID, RHSID);
13828 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13829 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13830 Input = MaskElt < 4 ? V1 : V2;
13831 // Be careful about creating illegal types. Use f16 instead of i16.
13832 if (VT == MVT::v4i16) {
13833 Input = DAG.getBitcast(MVT::v4f16, Input);
13834 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13835 }
13836 }
13837 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
13838 Input.getValueType().getVectorElementType(),
13839 Input, DAG.getVectorIdxConstant(ExtLane, DL));
13840 SDValue Ins =
13841 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
13842 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
13843 return DAG.getBitcast(VT, Ins);
13844 }
13845
13846 SDValue OpLHS, OpRHS;
13847 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13848 RHS, DAG, DL);
13849 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13850 RHS, DAG, DL);
13851 EVT VT = OpLHS.getValueType();
13852
13853 switch (OpNum) {
13854 default:
13855 llvm_unreachable("Unknown shuffle opcode!");
13856 case OP_VREV:
13857 // VREV divides the vector in half and swaps within the half.
13858 if (VT.getVectorElementType() == MVT::i32 ||
13859 VT.getVectorElementType() == MVT::f32)
13860 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
13861 // vrev <4 x i16> -> REV32
13862 if (VT.getVectorElementType() == MVT::i16 ||
13863 VT.getVectorElementType() == MVT::f16 ||
13864 VT.getVectorElementType() == MVT::bf16)
13865 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
13866 // vrev <4 x i8> -> REV16
13867 assert(VT.getVectorElementType() == MVT::i8);
13868 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
13869 case OP_VDUP0:
13870 case OP_VDUP1:
13871 case OP_VDUP2:
13872 case OP_VDUP3: {
13873 EVT EltTy = VT.getVectorElementType();
13874 unsigned Opcode;
13875 if (EltTy == MVT::i8)
13876 Opcode = AArch64ISD::DUPLANE8;
13877 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13878 Opcode = AArch64ISD::DUPLANE16;
13879 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13880 Opcode = AArch64ISD::DUPLANE32;
13881 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13882 Opcode = AArch64ISD::DUPLANE64;
13883 else
13884 llvm_unreachable("Invalid vector element type?");
13885
13886 if (VT.getSizeInBits() == 64)
13887 OpLHS = WidenVector(OpLHS, DAG);
13888 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
13889 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
13890 }
13891 case OP_VEXT1:
13892 case OP_VEXT2:
13893 case OP_VEXT3: {
13894 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13895 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
13896 DAG.getConstant(Imm, DL, MVT::i32));
13897 }
13898 case OP_VUZPL:
13899 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
13900 case OP_VUZPR:
13901 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
13902 case OP_VZIPL:
13903 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
13904 case OP_VZIPR:
13905 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
13906 case OP_VTRNL:
13907 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
13908 case OP_VTRNR:
13909 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
13910 }
13911}
13912
13913 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
13914 SelectionDAG &DAG) {
13915 // Check to see if we can use the TBL instruction.
13916 SDValue V1 = Op.getOperand(0);
13917 SDValue V2 = Op.getOperand(1);
13918 SDLoc DL(Op);
13919
13920 EVT EltVT = Op.getValueType().getVectorElementType();
13921 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13922
13923 bool Swap = false;
13924 if (V1.isUndef() || isZerosVector(V1.getNode())) {
13925 std::swap(V1, V2);
13926 Swap = true;
13927 }
13928
13929 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13930 // out of range values with 0s. We do need to make sure that any out-of-range
13931 // values are really out-of-range for a v16i8 vector.
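// For example (a sketch): a two-input v4i16 shuffle with mask <0, 5, 2, 7>
// and BytesPerElt == 2 produces the byte indices <0,1, 10,11, 4,5, 14,15>;
// if V2 is undef or zero, the indices that would reach V2 are forced to 255
// so that TBL1 writes zeros for those lanes instead.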
13932 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13933 MVT IndexVT = MVT::v8i8;
13934 unsigned IndexLen = 8;
13935 if (Op.getValueSizeInBits() == 128) {
13936 IndexVT = MVT::v16i8;
13937 IndexLen = 16;
13938 }
13939
13940 SmallVector<SDValue, 8> TBLMask;
13941 for (int Val : ShuffleMask) {
13942 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13943 unsigned Offset = Byte + Val * BytesPerElt;
13944 if (Swap)
13945 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13946 if (IsUndefOrZero && Offset >= IndexLen)
13947 Offset = 255;
13948 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13949 }
13950 }
13951
13952 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13953 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13954
13955 SDValue Shuffle;
13956 if (IsUndefOrZero) {
13957 if (IndexLen == 8)
13958 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13959 Shuffle = DAG.getNode(
13960 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13961 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13962 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13963 } else {
13964 if (IndexLen == 8) {
13965 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13966 Shuffle = DAG.getNode(
13967 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13968 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13969 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13970 } else {
13971 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13972 // cannot currently represent the register constraints on the input
13973 // table registers.
13974 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13975 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13976 // IndexLen));
13977 Shuffle = DAG.getNode(
13978 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13979 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13980 V2Cst,
13981 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13982 }
13983 }
13984 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13985}
13986
13987static unsigned getDUPLANEOp(EVT EltType) {
13988 if (EltType == MVT::i8)
13989 return AArch64ISD::DUPLANE8;
13990 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13991 return AArch64ISD::DUPLANE16;
13992 if (EltType == MVT::i32 || EltType == MVT::f32)
13993 return AArch64ISD::DUPLANE32;
13994 if (EltType == MVT::i64 || EltType == MVT::f64)
13995 return AArch64ISD::DUPLANE64;
13996
13997 llvm_unreachable("Invalid vector element type?");
13998}
13999
14000static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14001 unsigned Opcode, SelectionDAG &DAG) {
14002 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14003 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14004 // Match: dup (bitcast (extract_subv X, C)), LaneC
14005 if (BitCast.getOpcode() != ISD::BITCAST ||
14006 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14007 return false;
14008
14009 // The extract index must align in the destination type. That may not
14010 // happen if the bitcast is from narrow to wide type.
14011 SDValue Extract = BitCast.getOperand(0);
14012 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14013 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14014 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14015 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14016 if (ExtIdxInBits % CastedEltBitWidth != 0)
14017 return false;
14018
14019 // Can't handle cases where vector size is not 128-bit
14020 if (!Extract.getOperand(0).getValueType().is128BitVector())
14021 return false;
14022
14023 // Update the lane value by offsetting with the scaled extract index.
14024 LaneC += ExtIdxInBits / CastedEltBitWidth;
14025
14026 // Determine the casted vector type of the wide vector input.
14027 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14028 // Examples:
14029 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14030 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14031 unsigned SrcVecNumElts =
14032 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14033 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14034 SrcVecNumElts);
14035 return true;
14036 };
14037 MVT CastVT;
14038 if (getScaledOffsetDup(V, Lane, CastVT)) {
14039 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14040 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14041 V.getOperand(0).getValueType().is128BitVector()) {
14042 // The lane is incremented by the index of the extract.
14043 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14044 Lane += V.getConstantOperandVal(1);
14045 V = V.getOperand(0);
14046 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14047 // The lane is decremented if we are splatting from the 2nd operand.
14048 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14049 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14050 Lane -= Idx * VT.getVectorNumElements() / 2;
14051 V = WidenVector(V.getOperand(Idx), DAG);
14052 } else if (VT.getSizeInBits() == 64) {
14053 // Widen the operand to 128-bit register with undef.
14054 V = WidenVector(V, DAG);
14055 }
14056 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14057}
14058
14059// Try to widen element type to get a new mask value for a better permutation
14060// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14061// UZP1/2, TRN1/2, REV, INS, etc.
14062// For example:
14063// shufflevector <4 x i32> %a, <4 x i32> %b,
14064// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14065// is equivalent to:
14066// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14067// Finally, we can get:
14068// mov v0.d[0], v1.d[1]
14069 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14070 SDLoc DL(Op);
14071 EVT VT = Op.getValueType();
14072 EVT ScalarVT = VT.getVectorElementType();
14073 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14074 SDValue V0 = Op.getOperand(0);
14075 SDValue V1 = Op.getOperand(1);
14076 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14077
14078 // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64,
14079 // we need to make sure the wider element type is legal. Thus, ElementSize
14080 // should not be larger than 32 bits, and the i1 type should also be excluded.
14081 if (ElementSize > 32 || ElementSize == 1)
14082 return SDValue();
14083
14084 SmallVector<int, 8> NewMask;
14085 if (widenShuffleMaskElts(Mask, NewMask)) {
14086 MVT NewEltVT = VT.isFloatingPoint()
14087 ? MVT::getFloatingPointVT(ElementSize * 2)
14088 : MVT::getIntegerVT(ElementSize * 2);
14089 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14090 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14091 V0 = DAG.getBitcast(NewVT, V0);
14092 V1 = DAG.getBitcast(NewVT, V1);
14093 return DAG.getBitcast(VT,
14094 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14095 }
14096 }
14097
14098 return SDValue();
14099}
14100
14101// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
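// (Sketch of the index remapping: a tbl2 mask addresses bytes 0-31 of its two
// table registers, so entries coming from the first tbl2 can be reused as-is,
// while entries taken from the second tbl2's mask are rebased by +32 so they
// address table registers 3 and 4 of the combined tbl4.)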
14102 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14103 ArrayRef<int> ShuffleMask,
14104 SelectionDAG &DAG) {
14105 SDValue Tbl1 = Op->getOperand(0);
14106 SDValue Tbl2 = Op->getOperand(1);
14107 SDLoc DL(Op);
14108 SDValue Tbl2ID =
14109 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14110
14111 EVT VT = Op.getValueType();
14112 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14113 Tbl1.getOperand(0) != Tbl2ID ||
14114 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14115 Tbl2.getOperand(0) != Tbl2ID)
14116 return SDValue();
14117
14118 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14119 return SDValue();
14120
14121 SDValue Mask1 = Tbl1.getOperand(3);
14122 SDValue Mask2 = Tbl2.getOperand(3);
14123 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14124 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14125 return SDValue();
14126
14127 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14128 for (unsigned I = 0; I < 16; I++) {
14129 if (ShuffleMask[I] < 16)
14130 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14131 else {
14132 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14133 if (!C)
14134 return SDValue();
14135 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14136 }
14137 }
14138
14139 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14140 SDValue ID =
14141 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14142
14143 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14144 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14145 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14146}
14147
14148// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14149// but we don't have an appropriate instruction,
14150// so custom-lower it as ZIP1-with-zeros.
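// E.g. (sketch, little-endian, v16i8 -> v8i16): ZIP1 of the source with an
// all-zero vector interleaves each source byte with a zero byte, so the
// result bitcast to v8i16 is exactly the low eight bytes zero-extended.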
14151SDValue
14152AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14153 SelectionDAG &DAG) const {
14154 SDLoc DL(Op);
14155 EVT VT = Op.getValueType();
14156 SDValue SrcOp = Op.getOperand(0);
14157 EVT SrcVT = SrcOp.getValueType();
14158 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14159 "Unexpected extension factor.");
14160 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14161 // FIXME: support multi-step zipping?
14162 if (Scale != 2)
14163 return SDValue();
14164 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14165 return DAG.getBitcast(VT,
14166 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14167}
14168
14169SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14170 SelectionDAG &DAG) const {
14171 SDLoc DL(Op);
14172 EVT VT = Op.getValueType();
14173
14174 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14175
14176 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14177 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14178
14179 // Convert shuffles that are directly supported on NEON to target-specific
14180 // DAG nodes, instead of keeping them as shuffles and matching them again
14181 // during code selection. This is more efficient and avoids the possibility
14182 // of inconsistencies between legalization and selection.
14183 ArrayRef<int> ShuffleMask = SVN->getMask();
14184
14185 SDValue V1 = Op.getOperand(0);
14186 SDValue V2 = Op.getOperand(1);
14187
14188 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14189 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14190 "Unexpected VECTOR_SHUFFLE mask size!");
14191
14192 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14193 return Res;
14194
14195 if (SVN->isSplat()) {
14196 int Lane = SVN->getSplatIndex();
14197 // If this is an undef splat, generate it via "just" vdup, if possible.
14198 if (Lane == -1)
14199 Lane = 0;
14200
14201 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14202 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14203 V1.getOperand(0));
14204 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14205 // constant. If so, we can just reference the lane's definition directly.
14206 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14207 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14208 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14209
14210 // Otherwise, duplicate from the lane of the input vector.
14211 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14212 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14213 }
14214
14215 // Check if the mask matches a DUP for a wider element
14216 for (unsigned LaneSize : {64U, 32U, 16U}) {
14217 unsigned Lane = 0;
14218 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14219 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14220 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14221 : AArch64ISD::DUPLANE16;
14222 // Cast V1 to an integer vector with required lane size
14223 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14224 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14225 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14226 V1 = DAG.getBitcast(NewVecTy, V1);
14227 // Construct the DUP instruction
14228 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14229 // Cast back to the original type
14230 return DAG.getBitcast(VT, V1);
14231 }
14232 }
14233
14234 unsigned NumElts = VT.getVectorNumElements();
14235 unsigned EltSize = VT.getScalarSizeInBits();
14236 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14237 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14238 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14239 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14240 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14241 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14242
14243 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14244 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14245 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14246 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14247 DAG.getConstant(8, DL, MVT::i32));
14248 }
14249
14250 bool ReverseEXT = false;
14251 unsigned Imm;
14252 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14253 if (ReverseEXT)
14254 std::swap(V1, V2);
14255 Imm *= getExtFactor(V1);
14256 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14257 DAG.getConstant(Imm, DL, MVT::i32));
14258 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14259 Imm *= getExtFactor(V1);
14260 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14261 DAG.getConstant(Imm, DL, MVT::i32));
14262 }
14263
14264 unsigned WhichResult;
14265 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14266 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14267 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14268 }
14269 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14270 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14271 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14272 }
14273 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14274 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14275 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14276 }
14277
14278 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14279 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14280 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14281 }
14282 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14283 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14284 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14285 }
14286 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14287 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14288 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14289 }
14290
14291 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14292 return Concat;
14293
14294 bool DstIsLeft;
14295 int Anomaly;
14296 int NumInputElements = V1.getValueType().getVectorNumElements();
14297 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14298 SDValue DstVec = DstIsLeft ? V1 : V2;
14299 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14300
14301 SDValue SrcVec = V1;
14302 int SrcLane = ShuffleMask[Anomaly];
14303 if (SrcLane >= NumInputElements) {
14304 SrcVec = V2;
14305 SrcLane -= NumElts;
14306 }
14307 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14308
14309 EVT ScalarVT = VT.getVectorElementType();
14310
14311 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14312 ScalarVT = MVT::i32;
14313
14314 return DAG.getNode(
14315 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14316 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14317 DstLaneV);
14318 }
14319
14320 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14321 return NewSD;
14322
14323 // If the shuffle is not directly supported and it has 4 elements, use
14324 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14325 if (NumElts == 4) {
14326 unsigned PFIndexes[4];
14327 for (unsigned i = 0; i != 4; ++i) {
14328 if (ShuffleMask[i] < 0)
14329 PFIndexes[i] = 8;
14330 else
14331 PFIndexes[i] = ShuffleMask[i];
14332 }
14333
14334 // Compute the index in the perfect shuffle table.
14335 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14336 PFIndexes[2] * 9 + PFIndexes[3];
14337 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14338 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14339 DL);
14340 }
14341
14342 // Check for a "select shuffle", generating a BSL to pick between lanes in
14343 // V1/V2.
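// E.g. (sketch): the v4i32 mask <0, 5, 2, 7> takes lanes alternately from V1
// and V2, so we build the constant lane mask <-1, 0, -1, 0> and emit a
// BSP/BSL that blends the two inputs bitwise.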
14344 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14345 assert(VT.getScalarSizeInBits() <= 32 &&
14346 "Expected larger vector element sizes to be handled already");
14347 SmallVector<SDValue> MaskElts;
14348 for (int M : ShuffleMask)
14349 MaskElts.push_back(DAG.getConstant(
14350 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14351 EVT IVT = VT.changeVectorElementTypeToInteger();
14352 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14353 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14354 DAG.getBitcast(IVT, V1),
14355 DAG.getBitcast(IVT, V2)));
14356 }
14357
14358 // Fall back to generating a TBL
14359 return GenerateTBL(Op, ShuffleMask, DAG);
14360}
14361
14362SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14363 SelectionDAG &DAG) const {
14364 EVT VT = Op.getValueType();
14365
14366 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14367 return LowerToScalableOp(Op, DAG);
14368
14369 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14370 "Unexpected vector type!");
14371
14372 // We can handle the constant cases during isel.
14373 if (isa<ConstantSDNode>(Op.getOperand(0)))
14374 return Op;
14375
14376 // There isn't a natural way to handle the general i1 case, so we use some
14377 // trickery with whilelo.
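// (Sketch of why this works: after the sign-extension the splat value is
// either 0 or UINT64_MAX, so whilelo(0, SplatVal) yields an all-false
// predicate for 0 and an all-true predicate for UINT64_MAX, which is exactly
// the requested i1 splat.)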
14378 SDLoc DL(Op);
14379 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14380 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14381 DAG.getValueType(MVT::i1));
14382 SDValue ID =
14383 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14384 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14385 if (VT == MVT::nxv1i1)
14386 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14387 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14388 Zero, SplatVal),
14389 Zero);
14390 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14391}
14392
14393SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14394 SelectionDAG &DAG) const {
14395 SDLoc DL(Op);
14396
14397 EVT VT = Op.getValueType();
14398 if (!isTypeLegal(VT) || !VT.isScalableVector())
14399 return SDValue();
14400
14401 // Current lowering only supports the SVE-ACLE types.
14402 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14403 return SDValue();
14404
14405 // The DUPQ operation is independent of element type so normalise to i64s.
14406 SDValue Idx128 = Op.getOperand(2);
14407
14408 // DUPQ can be used when idx is in range.
14409 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14410 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14411 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14412 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14413 }
14414
14415 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14416
14417 // The ACLE says this must produce the same result as:
14418 // svtbl(data, svadd_x(svptrue_b64(),
14419 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14420 // index * 2))
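// For example (sketch): with index == 1 the mask built below becomes
// <2, 3, 2, 3, ...>, so the TBL copies 128-bit quadword 1 of the source into
// every quadword of the result.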
14421 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14422 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14423
14424 // create the vector 0,1,0,1,...
14425 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14426 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14427
14428 // create the vector idx64,idx64+1,idx64,idx64+1,...
14429 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14430 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14431 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14432
14433 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14434 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14435 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14436}
14437
14438
14439static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14440 APInt &UndefBits) {
14441 EVT VT = BVN->getValueType(0);
14442 APInt SplatBits, SplatUndef;
14443 unsigned SplatBitSize;
14444 bool HasAnyUndefs;
14445 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14446 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14447
14448 for (unsigned i = 0; i < NumSplats; ++i) {
14449 CnstBits <<= SplatBitSize;
14450 UndefBits <<= SplatBitSize;
14451 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14452 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14453 }
14454
14455 return true;
14456 }
14457
14458 return false;
14459}
14460
14461// Try 64-bit splatted SIMD immediate.
14462static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14463 const APInt &Bits) {
14464 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14465 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14466 EVT VT = Op.getValueType();
14467 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14468
14469 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14470 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14471
14472 SDLoc DL(Op);
14473 SDValue Mov =
14474 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14475 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14476 }
14477 }
14478
14479 return SDValue();
14480}
14481
14482// Try 32-bit splatted SIMD immediate.
14483static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14484 const APInt &Bits,
14485 const SDValue *LHS = nullptr) {
14486 EVT VT = Op.getValueType();
14487 if (VT.isFixedLengthVector() &&
14488 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14489 return SDValue();
14490
14491 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14492 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14493 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14494 bool isAdvSIMDModImm = false;
14495 uint64_t Shift;
14496
14497 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14498 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14499 Shift = 0;
14500 }
14501 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14502 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14503 Shift = 8;
14504 }
14505 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14506 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14507 Shift = 16;
14508 }
14509 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14510 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14511 Shift = 24;
14512 }
14513
14514 if (isAdvSIMDModImm) {
14515 SDLoc DL(Op);
14516 SDValue Mov;
14517
14518 if (LHS)
14519 Mov = DAG.getNode(NewOp, DL, MovTy,
14520 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14521 DAG.getConstant(Value, DL, MVT::i32),
14522 DAG.getConstant(Shift, DL, MVT::i32));
14523 else
14524 Mov =
14525 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14526 DAG.getConstant(Shift, DL, MVT::i32));
14527
14528 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14529 }
14530 }
14531
14532 return SDValue();
14533}
14534
14535// Try 16-bit splatted SIMD immediate.
14536static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14537 const APInt &Bits,
14538 const SDValue *LHS = nullptr) {
14539 EVT VT = Op.getValueType();
14540 if (VT.isFixedLengthVector() &&
14541 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14542 return SDValue();
14543
14544 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14545 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14546 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14547 bool isAdvSIMDModImm = false;
14548 uint64_t Shift;
14549
14550 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14551 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14552 Shift = 0;
14553 }
14554 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14555 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14556 Shift = 8;
14557 }
14558
14559 if (isAdvSIMDModImm) {
14560 SDLoc DL(Op);
14561 SDValue Mov;
14562
14563 if (LHS)
14564 Mov = DAG.getNode(NewOp, DL, MovTy,
14565 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14566 DAG.getConstant(Value, DL, MVT::i32),
14567 DAG.getConstant(Shift, DL, MVT::i32));
14568 else
14569 Mov =
14570 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14571 DAG.getConstant(Shift, DL, MVT::i32));
14572
14573 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14574 }
14575 }
14576
14577 return SDValue();
14578}
14579
14580// Try 32-bit splatted SIMD immediate with shifted ones.
14581 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14582 SelectionDAG &DAG, const APInt &Bits) {
14583 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14584 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14585 EVT VT = Op.getValueType();
14586 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14587 bool isAdvSIMDModImm = false;
14588 uint64_t Shift;
14589
14590 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14591 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14592 Shift = 264;
14593 }
14594 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14595 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14596 Shift = 272;
14597 }
14598
14599 if (isAdvSIMDModImm) {
14600 SDLoc DL(Op);
14601 SDValue Mov =
14602 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14603 DAG.getConstant(Shift, DL, MVT::i32));
14604 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14605 }
14606 }
14607
14608 return SDValue();
14609}
14610
14611// Try 8-bit splatted SIMD immediate.
14612static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14613 const APInt &Bits) {
14614 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14615 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14616 EVT VT = Op.getValueType();
14617 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14618
14619 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14620 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14621
14622 SDLoc DL(Op);
14623 SDValue Mov =
14624 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14625 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14626 }
14627 }
14628
14629 return SDValue();
14630}
14631
14632// Try FP splatted SIMD immediate.
14633static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14634 const APInt &Bits) {
14635 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14636 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14637 EVT VT = Op.getValueType();
14638 bool isWide = (VT.getSizeInBits() == 128);
14639 MVT MovTy;
14640 bool isAdvSIMDModImm = false;
14641
14642 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14643 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14644 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14645 }
14646 else if (isWide &&
14647 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14648 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14649 MovTy = MVT::v2f64;
14650 }
14651
14652 if (isAdvSIMDModImm) {
14653 SDLoc DL(Op);
14654 SDValue Mov =
14655 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14656 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14657 }
14658 }
14659
14660 return SDValue();
14661}
14662
14663 // Specialized code to quickly find if PotentialBVec is a BuildVector that
14664 // consists of only the same constant int value, returned in the reference
14665 // argument ConstVal.
14666static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14667 uint64_t &ConstVal) {
14668 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14669 if (!Bvec)
14670 return false;
14671 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14672 if (!FirstElt)
14673 return false;
14674 EVT VT = Bvec->getValueType(0);
14675 unsigned NumElts = VT.getVectorNumElements();
14676 for (unsigned i = 1; i < NumElts; ++i)
14677 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14678 return false;
14679 ConstVal = FirstElt->getZExtValue();
14680 return true;
14681}
14682
14683 static bool isAllInactivePredicate(SDValue N) {
14684 // Look through cast.
14685 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14686 N = N.getOperand(0);
14687
14688 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14689}
14690
14691 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14692 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14693
14694 // Look through cast.
14695 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14696 N = N.getOperand(0);
14697 // When reinterpreting from a type with fewer elements the "new" elements
14698 // are not active, so bail if they're likely to be used.
14699 if (N.getValueType().getVectorMinNumElements() < NumElts)
14700 return false;
14701 }
14702
14703 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14704 return true;
14705
14706 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14707 // or smaller than the implicit element type represented by N.
14708 // NOTE: A larger element count implies a smaller element type.
14709 if (N.getOpcode() == AArch64ISD::PTRUE &&
14710 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14711 return N.getValueType().getVectorMinNumElements() >= NumElts;
14712
14713 // If we're compiling for a specific vector-length, we can check if the
14714 // pattern's VL equals that of the scalable vector at runtime.
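// For example (sketch): with -msve-vector-bits=256, VScale is 2, so a
// "ptrue p0.s, vl8" pattern covers all lanes of an nxv4i1 predicate and is
// treated as all-active here.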
14715 if (N.getOpcode() == AArch64ISD::PTRUE) {
14716 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14717 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14718 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14719 if (MaxSVESize && MinSVESize == MaxSVESize) {
14720 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14721 unsigned PatNumElts =
14722 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14723 return PatNumElts == (NumElts * VScale);
14724 }
14725 }
14726
14727 return false;
14728}
14729
14730// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14731// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14732// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14733// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14734// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14735// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
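// For example (a sketch): with v8i16 lanes, (or (and X, splat(0x00ff)),
// (shl Y, 8)) satisfies C1 == ~(0xffff << 8) and becomes an SLI of Y into X
// by #8, i.e. "sli v(X).8h, v(Y).8h, #8".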
14736 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14737 EVT VT = N->getValueType(0);
14738
14739 if (!VT.isVector())
14740 return SDValue();
14741
14742 SDLoc DL(N);
14743
14744 SDValue And;
14745 SDValue Shift;
14746
14747 SDValue FirstOp = N->getOperand(0);
14748 unsigned FirstOpc = FirstOp.getOpcode();
14749 SDValue SecondOp = N->getOperand(1);
14750 unsigned SecondOpc = SecondOp.getOpcode();
14751
14752 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14753 // a BICi in order to use an immediate instead of a register.
14754 // Is the other operand an shl or lshr? This will have been turned into:
14755 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14756 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14757 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14758 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14759 SecondOpc == AArch64ISD::SHL_PRED ||
14760 SecondOpc == AArch64ISD::SRL_PRED)) {
14761 And = FirstOp;
14762 Shift = SecondOp;
14763
14764 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14765 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14766 FirstOpc == AArch64ISD::SHL_PRED ||
14767 FirstOpc == AArch64ISD::SRL_PRED)) {
14768 And = SecondOp;
14769 Shift = FirstOp;
14770 } else
14771 return SDValue();
14772
14773 bool IsAnd = And.getOpcode() == ISD::AND;
14774 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14775 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14776 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14777 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14778
14779 // Is the shift amount constant and are all lanes active?
14780 uint64_t C2;
14781 if (ShiftHasPredOp) {
14782 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14783 return SDValue();
14784 APInt C;
14785 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14786 return SDValue();
14787 C2 = C.getZExtValue();
14788 } else if (ConstantSDNode *C2node =
14789 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14790 C2 = C2node->getZExtValue();
14791 else
14792 return SDValue();
14793
14794 APInt C1AsAPInt;
14795 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14796 if (IsAnd) {
14797 // Is the and mask vector all constant?
14798 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14799 return SDValue();
14800 } else {
14801 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14802 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14803 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14804 assert(C1nodeImm && C1nodeShift);
14805 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14806 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14807 }
14808
14809 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14810 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14811 // how much one can shift elements of a particular size?
14812 if (C2 > ElemSizeInBits)
14813 return SDValue();
14814
14815 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14816 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14817 if (C1AsAPInt != RequiredC1)
14818 return SDValue();
14819
14820 SDValue X = And.getOperand(0);
14821 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14822 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14823 : Shift.getOperand(1);
14824
14825 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14826 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14827
14828 return ResultSLI;
14829}
14830
14831 static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
14832 EVT VT = N->getValueType(0);
14833 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
14834 SDLoc DL(N);
14835 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14836
14837 if (VT.isScalableVector() && !Subtarget.hasSVE2())
14838 return SDValue();
14839
14840 SDValue N0 = N->getOperand(0);
14841 if (N0.getOpcode() != ISD::AND)
14842 return SDValue();
14843
14844 SDValue N1 = N->getOperand(1);
14845 if (N1.getOpcode() != ISD::AND)
14846 return SDValue();
14847
14848 // InstCombine does (not (neg a)) => (add a -1).
14849 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
14850 // Loop over all combinations of AND operands.
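// Illustration: if each lane of a is 0 or 1, then (sub 0, a) is 0 or all-ones
// and (add a, -1) is its bitwise complement, so the OR of the two ANDs picks
// b in lanes where a is 1 and c in lanes where a is 0 -- a bitwise select,
// which is what BSL/BSP (with (sub 0, a) as the mask) computes.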
14851 for (int i = 1; i >= 0; --i) {
14852 for (int j = 1; j >= 0; --j) {
14853 SDValue O0 = N0->getOperand(i);
14854 SDValue O1 = N1->getOperand(j);
14855 SDValue Sub, Add, SubSibling, AddSibling;
14856
14857 // Find a SUB and an ADD operand, one from each AND.
14858 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
14859 Sub = O0;
14860 Add = O1;
14861 SubSibling = N0->getOperand(1 - i);
14862 AddSibling = N1->getOperand(1 - j);
14863 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
14864 Add = O0;
14865 Sub = O1;
14866 AddSibling = N0->getOperand(1 - i);
14867 SubSibling = N1->getOperand(1 - j);
14868 } else
14869 continue;
14870
14871 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
14872 continue;
14873
14874 // The constant all-ones vector is always the righthand operand of the Add.
14875 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
14876 continue;
14877
14878 if (Sub.getOperand(1) != Add.getOperand(0))
14879 continue;
14880
14881 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
14882 }
14883 }
14884
14885 // (or (and a b) (and (not a) c)) => (bsl a b c)
14886 // We only have to look for constant vectors here since the general, variable
14887 // case can be handled in TableGen.
14888 unsigned Bits = VT.getScalarSizeInBits();
14889 for (int i = 1; i >= 0; --i)
14890 for (int j = 1; j >= 0; --j) {
14891 APInt Val1, Val2;
14892
14893 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
14894 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
14895 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
14896 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
14897 N0->getOperand(1 - i), N1->getOperand(1 - j));
14898 }
14899 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
14900 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
14901 if (!BVN0 || !BVN1)
14902 continue;
14903
14904 bool FoundMatch = true;
14905 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
14906 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
14907 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
14908 if (!CN0 || !CN1 ||
14909 CN0->getAPIntValue().trunc(Bits) !=
14910 ~CN1->getAsAPIntVal().trunc(Bits)) {
14911 FoundMatch = false;
14912 break;
14913 }
14914 }
14915 if (FoundMatch)
14916 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
14917 N0->getOperand(1 - i), N1->getOperand(1 - j));
14918 }
14919
14920 return SDValue();
14921}
14922
14923SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14924 SelectionDAG &DAG) const {
14925 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14926 !Subtarget->isNeonAvailable()))
14927 return LowerToScalableOp(Op, DAG);
14928
14929 if (SDValue Res = tryLowerToBSL(Op, DAG))
14930 return Res;
14931
14932 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14933 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14934 return Res;
14935
14936 EVT VT = Op.getValueType();
14937 if (VT.isScalableVector())
14938 return Op;
14939
14940 SDValue LHS = Op.getOperand(0);
14941 BuildVectorSDNode *BVN =
14942 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14943 if (!BVN) {
14944 // OR commutes, so try swapping the operands.
14945 LHS = Op.getOperand(1);
14946 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14947 }
14948 if (!BVN)
14949 return Op;
14950
14951 APInt DefBits(VT.getSizeInBits(), 0);
14952 APInt UndefBits(VT.getSizeInBits(), 0);
14953 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14954 SDValue NewOp;
14955
14956 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14957 DefBits, &LHS)) ||
14958 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14959 DefBits, &LHS)))
14960 return NewOp;
14961
14962 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14963 UndefBits, &LHS)) ||
14964 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14965 UndefBits, &LHS)))
14966 return NewOp;
14967 }
14968
14969 // We can always fall back to a non-immediate OR.
14970 return Op;
14971}
14972
14973// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14974// be truncated to fit element width.
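// For example, a v8i8 BUILD_VECTOR whose lanes are i32 constants (as left by
// type legalization) has each constant truncated to 8 bits here, so an
// operand of 0x1FF is rebuilt as the i32 constant 0xFF; undef lanes become
// i32 undef, and everything else is expected to already be i32.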
14975 static SDValue NormalizeBuildVector(SDValue Op,
14976 SelectionDAG &DAG) {
14977 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14978 SDLoc DL(Op);
14979 EVT VT = Op.getValueType();
14980 EVT EltTy = VT.getVectorElementType();
14981
14982 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14983 return Op;
14984
14985 SmallVector<SDValue, 16> Ops;
14986 for (SDValue Lane : Op->ops()) {
14987 // For integer vectors, type legalization would have promoted the
14988 // operands already. Otherwise, if Op is a floating-point splat
14989 // (with operands cast to integers), then the only possibilities
14990 // are constants and UNDEFs.
14991 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14992 Lane = DAG.getConstant(
14993 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14994 DL, MVT::i32);
14995 } else if (Lane.getNode()->isUndef()) {
14996 Lane = DAG.getUNDEF(MVT::i32);
14997 } else {
14998 assert(Lane.getValueType() == MVT::i32 &&
14999 "Unexpected BUILD_VECTOR operand type");
15000 }
15001 Ops.push_back(Lane);
15002 }
15003 return DAG.getBuildVector(VT, DL, Ops);
15004}
15005
15006 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15007 const AArch64Subtarget *ST) {
15008 EVT VT = Op.getValueType();
15009 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15010 "Expected a legal NEON vector");
15011
15012 APInt DefBits(VT.getSizeInBits(), 0);
15013 APInt UndefBits(VT.getSizeInBits(), 0);
15014 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15015 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15016 auto TryMOVIWithBits = [&](APInt DefBits) {
15017 SDValue NewOp;
15018 if ((NewOp =
15019 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15020 (NewOp =
15021 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15022 (NewOp =
15023 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15024 (NewOp =
15025 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15026 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15027 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15028 return NewOp;
15029
15030 APInt NotDefBits = ~DefBits;
15031 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15032 NotDefBits)) ||
15033 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15034 NotDefBits)) ||
15035 (NewOp =
15036 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15037 return NewOp;
15038 return SDValue();
15039 };
15040 if (SDValue R = TryMOVIWithBits(DefBits))
15041 return R;
15042 if (SDValue R = TryMOVIWithBits(UndefBits))
15043 return R;
15044
15045 // See if a fneg of the constant can be materialized with a MOVI, etc
15046 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15047 // FNegate each sub-element of the constant
15048 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15049 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15050 .zext(VT.getSizeInBits());
15051 APInt NegBits(VT.getSizeInBits(), 0);
15052 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15053 for (unsigned i = 0; i < NumElts; i++)
15054 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15055 NegBits = DefBits ^ NegBits;
15056
15057 // Try to create the new constants with MOVI, and if so generate a fneg
15058 // for it.
15059 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15060 SDLoc DL(Op);
15061 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15062 return DAG.getNode(
15063 AArch64ISD::NVCAST, DL, VT,
15064 DAG.getNode(ISD::FNEG, DL, VFVT,
15065 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15066 }
15067 return SDValue();
15068 };
15069 SDValue R;
15070 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15071 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15072 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15073 return R;
15074 }
15075
15076 return SDValue();
15077}
15078
15079SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15080 SDValue Op, SelectionDAG &DAG) const {
15081 EVT VT = Op.getValueType();
15082 SDLoc DL(Op);
15083 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15084 auto *BVN = cast<BuildVectorSDNode>(Op);
15085
15086 if (auto SeqInfo = BVN->isConstantSequence()) {
15087 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15088 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15089 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15090 return convertFromScalableVector(DAG, VT, Seq);
15091 }
15092
15093 unsigned NumElems = VT.getVectorNumElements();
15094 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15095 NumElems <= 1 || BVN->isConstant())
15096 return SDValue();
15097
15098 auto IsExtractElt = [](SDValue Op) {
15099 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15100 };
15101
15102 // For integer types that are not already in vectors, limit to at most four
15103 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15104 if (VT.getScalarType().isInteger() &&
15105 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15106 return SDValue();
15107
15108 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
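// For example, building v4i32 {a, b, c, d}: each element lands in lane 0 of
// its own vector, ZIP1 at nxv4i32 interleaves them into {a, b, ...} and
// {c, d, ...}, and a final ZIP1 at nxv2i64 interleaves those pairs so the low
// 128 bits hold {a, b, c, d}.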
15109 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15110 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15111 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15112 return Op.isUndef() ? Undef
15113 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15114 ContainerVT, Undef, Op, ZeroI64);
15115 });
15116
15117 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15118 while (Intermediates.size() > 1) {
15119 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15120
15121 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15122 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15123 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15124 Intermediates[I / 2] =
15125 Op1.isUndef() ? Op0
15126 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15127 }
15128
15129 Intermediates.resize(Intermediates.size() / 2);
15130 ZipEC = ZipEC.divideCoefficientBy(2);
15131 }
15132
15133 assert(Intermediates.size() == 1);
15134 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15135 return convertFromScalableVector(DAG, VT, Vec);
15136}
15137
15138SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15139 SelectionDAG &DAG) const {
15140 EVT VT = Op.getValueType();
15141
15142 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15143 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15144 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15145 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15146
15147 // Try to build a simple constant vector.
15148 Op = NormalizeBuildVector(Op, DAG);
15149 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15150 // abort.
15151 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15152 return SDValue();
15153
15154 // Certain vector constants, used to express things like logical NOT and
15155 // arithmetic NEG, are passed through unmodified. This allows special
15156 // patterns for these operations to match, which will lower these constants
15157 // to whatever is proven necessary.
15158 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15159 if (BVN->isConstant()) {
15160 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15161 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15162 APInt Val(BitSize,
15163 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15164 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15165 return Op;
15166 }
15167 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15168 if (Const->isZero() && !Const->isNegative())
15169 return Op;
15170 }
15171
15172 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15173 return V;
15174
15175 // Scan through the operands to find some interesting properties we can
15176 // exploit:
15177 // 1) If only one value is used, we can use a DUP, or
15178 // 2) if only the low element is not undef, we can just insert that, or
15179 // 3) if only one constant value is used (w/ some non-constant lanes),
15180 // we can splat the constant value into the whole vector then fill
15181 // in the non-constant lanes.
15182 // 4) FIXME: If different constant values are used, but we can intelligently
15183 // select the values we'll be overwriting for the non-constant
15184 // lanes such that we can directly materialize the vector
15185 // some other way (MOVI, e.g.), we can be sneaky.
15186 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15187 SDLoc DL(Op);
15188 unsigned NumElts = VT.getVectorNumElements();
15189 bool isOnlyLowElement = true;
15190 bool usesOnlyOneValue = true;
15191 bool usesOnlyOneConstantValue = true;
15192 bool isConstant = true;
15193 bool AllLanesExtractElt = true;
15194 unsigned NumConstantLanes = 0;
15195 unsigned NumDifferentLanes = 0;
15196 unsigned NumUndefLanes = 0;
15197 SDValue Value;
15198 SDValue ConstantValue;
15199 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15200 unsigned ConsecutiveValCount = 0;
15201 SDValue PrevVal;
15202 for (unsigned i = 0; i < NumElts; ++i) {
15203 SDValue V = Op.getOperand(i);
15204 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15205 AllLanesExtractElt = false;
15206 if (V.isUndef()) {
15207 ++NumUndefLanes;
15208 continue;
15209 }
15210 if (i > 0)
15211 isOnlyLowElement = false;
15212 if (!isIntOrFPConstant(V))
15213 isConstant = false;
15214
15215 if (isIntOrFPConstant(V)) {
15216 ++NumConstantLanes;
15217 if (!ConstantValue.getNode())
15218 ConstantValue = V;
15219 else if (ConstantValue != V)
15220 usesOnlyOneConstantValue = false;
15221 }
15222
15223 if (!Value.getNode())
15224 Value = V;
15225 else if (V != Value) {
15226 usesOnlyOneValue = false;
15227 ++NumDifferentLanes;
15228 }
15229
15230 if (PrevVal != V) {
15231 ConsecutiveValCount = 0;
15232 PrevVal = V;
15233 }
15234
15235 // Keep the different values and their last consecutive counts. For example,
15236 //
15237 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15238 // t24, t24, t24, t24, t24, t24, t24, t24
15239 // t23 = consecutive count 8
15240 // t24 = consecutive count 8
15241 // ------------------------------------------------------------------
15242 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15243 // t24, t24, t24, t24, t24, t24, t24, t24
15244 // t23 = consecutive count 5
15245 // t24 = consecutive count 9
15246 DifferentValueMap[V] = ++ConsecutiveValCount;
15247 }
15248
15249 if (!Value.getNode()) {
15250 LLVM_DEBUG(
15251 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15252 return DAG.getUNDEF(VT);
15253 }
15254
15255 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15256 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15257 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15258 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15259 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15260 "SCALAR_TO_VECTOR node\n");
15261 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15262 }
15263
15264 if (AllLanesExtractElt) {
15265 SDNode *Vector = nullptr;
15266 bool Even = false;
15267 bool Odd = false;
15268 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15269 // the Odd pattern <1,3,5,...>.
15270 for (unsigned i = 0; i < NumElts; ++i) {
15271 SDValue V = Op.getOperand(i);
15272 const SDNode *N = V.getNode();
15273 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15274 Even = false;
15275 Odd = false;
15276 break;
15277 }
15278 SDValue N0 = N->getOperand(0);
15279
15280 // All elements are extracted from the same vector.
15281 if (!Vector) {
15282 Vector = N0.getNode();
15283 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15284 // BUILD_VECTOR.
15285 if (VT.getVectorElementType() !=
15286 N0.getValueType().getVectorElementType())
15287 break;
15288 } else if (Vector != N0.getNode()) {
15289 Odd = false;
15290 Even = false;
15291 break;
15292 }
15293
15294 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15295 // indices <1,3,5,...>.
15296 uint64_t Val = N->getConstantOperandVal(1);
15297 if (Val == 2 * i) {
15298 Even = true;
15299 continue;
15300 }
15301 if (Val - 1 == 2 * i) {
15302 Odd = true;
15303 continue;
15304 }
15305
15306 // Something does not match: abort.
15307 Odd = false;
15308 Even = false;
15309 break;
15310 }
15311 if (Even || Odd) {
15312 SDValue LHS =
15313 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15314 DAG.getConstant(0, DL, MVT::i64));
15315 SDValue RHS =
15316 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15317 DAG.getConstant(NumElts, DL, MVT::i64));
15318
15319 if (Even && !Odd)
15320 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15321 if (Odd && !Even)
15322 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15323 }
15324 }
15325
15326 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15327 // i32 and try again.
15328 if (usesOnlyOneValue) {
15329 if (!isConstant) {
15330 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15331 Value.getValueType() != VT) {
15332 LLVM_DEBUG(
15333 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15334 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15335 }
15336
15337 // This is actually a DUPLANExx operation, which keeps everything vectory.
15338
15339 SDValue Lane = Value.getOperand(1);
15340 Value = Value.getOperand(0);
15341 if (Value.getValueSizeInBits() == 64) {
15342 LLVM_DEBUG(
15343 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15344 "widening it\n");
15345 Value = WidenVector(Value, DAG);
15346 }
15347
15348 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15349 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15350 }
15351
15352 if (VT.getVectorElementType().isFloatingPoint()) {
15353 SmallVector<SDValue, 8> Ops;
15354 EVT EltTy = VT.getVectorElementType();
15355 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15356 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15357 LLVM_DEBUG(
15358 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15359 "BITCASTS, and try again\n");
15360 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15361 for (unsigned i = 0; i < NumElts; ++i)
15362 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15363 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15364 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15365 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15366 Val.dump(););
15367 Val = LowerBUILD_VECTOR(Val, DAG);
15368 if (Val.getNode())
15369 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15370 }
15371 }
15372
15373 // If we need to insert a small number of different non-constant elements and
15374 // the vector width is sufficiently large, prefer using DUP with the common
15375 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15376 // skip the constant lane handling below.
15377 bool PreferDUPAndInsert =
15378 !isConstant && NumDifferentLanes >= 1 &&
15379 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15380 NumDifferentLanes >= NumConstantLanes;
15381
15382 // If only one constant value was used, but for more than one lane,
15383 // start by splatting that value, then replace the non-constant lanes. This
15384 // is better than the default, which will perform a separate initialization
15385 // for each lane.
15386 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15387 // Firstly, try to materialize the splat constant.
15388 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15389 unsigned BitSize = VT.getScalarSizeInBits();
15390 APInt ConstantValueAPInt(1, 0);
15391 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15392 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15393 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15394 !ConstantValueAPInt.isAllOnes()) {
15395 Val = ConstantBuildVector(Val, DAG, Subtarget);
15396 if (!Val)
15397 // Otherwise, materialize the constant and splat it.
15398 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15399 }
15400
15401 // Now insert the non-constant lanes.
15402 for (unsigned i = 0; i < NumElts; ++i) {
15403 SDValue V = Op.getOperand(i);
15404 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15405 if (!isIntOrFPConstant(V))
15406 // Note that type legalization likely mucked about with the VT of the
15407 // source operand, so we may have to convert it here before inserting.
15408 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15409 }
15410 return Val;
15411 }
15412
15413 // This will generate a load from the constant pool.
15414 if (isConstant) {
15415 LLVM_DEBUG(
15416 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15417 "expansion\n");
15418 return SDValue();
15419 }
15420
15421 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15422 // v4i32s. This is really a truncate, which we can construct out of (legal)
15423 // concats and truncate nodes.
15424 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15425 return M;
15426
15427 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15428 if (NumElts >= 4) {
15429 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15430 return Shuffle;
15431
15432 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15433 return Shuffle;
15434 }
15435
15436 if (PreferDUPAndInsert) {
15437 // First, build a constant vector with the common element.
15438 SmallVector<SDValue, 8> Ops(NumElts, Value);
15439 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15440 // Next, insert the elements that do not match the common value.
15441 for (unsigned I = 0; I < NumElts; ++I)
15442 if (Op.getOperand(I) != Value)
15443 NewVector =
15444 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15445 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15446
15447 return NewVector;
15448 }
15449
15450 // If vector consists of two different values, try to generate two DUPs and
15451 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15452 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15453 SmallVector<SDValue, 2> Vals;
15454 // Check whether the consecutive count of each value is half the number of
15455 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
15456 //
15457 // canUseVECTOR_CONCAT = true;
15458 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15459 // t24, t24, t24, t24, t24, t24, t24, t24
15460 //
15461 // canUseVECTOR_CONCAT = false;
15462 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15463 // t24, t24, t24, t24, t24, t24, t24, t24
15464 bool canUseVECTOR_CONCAT = true;
15465 for (auto Pair : DifferentValueMap) {
15466 // Check that each value's consecutive count is NumElts / 2.
15467 if (Pair.second != NumElts / 2)
15468 canUseVECTOR_CONCAT = false;
15469 Vals.push_back(Pair.first);
15470 }
15471
15472 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15473 // CONCAT_VECTORs. For example,
15474 //
15475 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15476 // t24, t24, t24, t24, t24, t24, t24, t24
15477 // ==>
15478 // t26: v8i8 = AArch64ISD::DUP t23
15479 // t28: v8i8 = AArch64ISD::DUP t24
15480 // t29: v16i8 = concat_vectors t26, t28
15481 if (canUseVECTOR_CONCAT) {
15482 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15483 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15484 SubVT.getVectorNumElements() >= 2) {
15485 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15486 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15487 SDValue DUP1 =
15488 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15489 SDValue DUP2 =
15490 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15491 SDValue CONCAT_VECTORS =
15492 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15493 return CONCAT_VECTORS;
15494 }
15495 }
15496
15497 // Let's try to generate VECTOR_SHUFFLE. For example,
15498 //
15499 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15500 // ==>
15501 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15502 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15503 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15504 if (NumElts >= 8) {
15505 SmallVector<int, 16> MaskVec;
15506 // Build the mask for the VECTOR_SHUFFLE.
15507 SDValue FirstLaneVal = Op.getOperand(0);
15508 for (unsigned i = 0; i < NumElts; ++i) {
15509 SDValue Val = Op.getOperand(i);
15510 if (FirstLaneVal == Val)
15511 MaskVec.push_back(i);
15512 else
15513 MaskVec.push_back(i + NumElts);
15514 }
15515
15516 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15517 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15518 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15519 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15520 SDValue VECTOR_SHUFFLE =
15521 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15522 return VECTOR_SHUFFLE;
15523 }
15524 }
15525
15526 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15527 // know the default expansion would otherwise fall back on something even
15528 // worse. For a vector with one or two non-undef values, that's
15529 // scalar_to_vector for the elements followed by a shuffle (provided the
15530 // shuffle is valid for the target) and materialization element by element
15531 // on the stack followed by a load for everything else.
15532 if (!isConstant && !usesOnlyOneValue) {
15533 LLVM_DEBUG(
15534 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15535 "of INSERT_VECTOR_ELT\n");
15536
15537 SDValue Vec = DAG.getUNDEF(VT);
15538 SDValue Op0 = Op.getOperand(0);
15539 unsigned i = 0;
15540
15541 // Use SCALAR_TO_VECTOR for lane zero to
15542 // a) Avoid a RMW dependency on the full vector register, and
15543 // b) Allow the register coalescer to fold away the copy if the
15544 // value is already in an S or D register, and we're forced to emit an
15545 // INSERT_SUBREG that we can't fold anywhere.
15546 //
15547 // We also allow types like i8 and i16 which are illegal scalar but legal
15548 // vector element types. After type-legalization the inserted value is
15549 // extended (i32) and it is safe to cast them to the vector type by ignoring
15550 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15551 if (!Op0.isUndef()) {
15552 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15553 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15554 ++i;
15555 }
15556 LLVM_DEBUG({
15557 if (i < NumElts)
15558 dbgs() << "Creating nodes for the other vector elements:\n";
15559 });
15560 for (; i < NumElts; ++i) {
15561 SDValue V = Op.getOperand(i);
15562 if (V.isUndef())
15563 continue;
15564 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15565 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15566 }
15567 return Vec;
15568 }
15569
15570 LLVM_DEBUG(
15571 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15572 "better alternative\n");
15573 return SDValue();
15574}
15575
15576SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15577 SelectionDAG &DAG) const {
15578 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15579 !Subtarget->isNeonAvailable()))
15580 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15581
15582 assert(Op.getValueType().isScalableVector() &&
15583 isTypeLegal(Op.getValueType()) &&
15584 "Expected legal scalable vector type!");
15585
15586 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15587 unsigned NumOperands = Op->getNumOperands();
15588 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15589 "Unexpected number of operands in CONCAT_VECTORS");
15590
15591 if (NumOperands == 2)
15592 return Op;
15593
15594 // Concat each pair of subvectors and pack into the lower half of the array.
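// For example, with four operands the first pass concatenates operands 0+1
// and 2+3 into two double-width subvectors, and the second pass concatenates
// those into the full result type.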
15595 SmallVector<SDValue> ConcatOps(Op->ops());
15596 while (ConcatOps.size() > 1) {
15597 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15598 SDValue V1 = ConcatOps[I];
15599 SDValue V2 = ConcatOps[I + 1];
15600 EVT SubVT = V1.getValueType();
15601 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15602 ConcatOps[I / 2] =
15603 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15604 }
15605 ConcatOps.resize(ConcatOps.size() / 2);
15606 }
15607 return ConcatOps[0];
15608 }
15609
15610 return SDValue();
15611}
15612
15613SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15614 SelectionDAG &DAG) const {
15615 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15616
15617 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15618 !Subtarget->isNeonAvailable()))
15619 return LowerFixedLengthInsertVectorElt(Op, DAG);
15620
15621 EVT VT = Op.getOperand(0).getValueType();
15622
15623 if (VT.getScalarType() == MVT::i1) {
15624 EVT VectorVT = getPromotedVTForPredicate(VT);
15625 SDLoc DL(Op);
15626 SDValue ExtendedVector =
15627 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15628 SDValue ExtendedValue =
15629 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15630 VectorVT.getScalarType().getSizeInBits() < 32
15631 ? MVT::i32
15632 : VectorVT.getScalarType());
15633 ExtendedVector =
15634 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15635 ExtendedValue, Op.getOperand(2));
15636 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15637 }
15638
15639 // Check for non-constant or out of range lane.
15640 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15641 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15642 return SDValue();
15643
15644 return Op;
15645}
15646
15647SDValue
15648AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15649 SelectionDAG &DAG) const {
15650 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15651 EVT VT = Op.getOperand(0).getValueType();
15652
15653 if (VT.getScalarType() == MVT::i1) {
15654 // We can't directly extract from an SVE predicate; extend it first.
15655 // (This isn't the only possible lowering, but it's straightforward.)
15656 EVT VectorVT = getPromotedVTForPredicate(VT);
15657 SDLoc DL(Op);
15658 SDValue Extend =
15659 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15660 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15661 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15662 Extend, Op.getOperand(1));
15663 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15664 }
15665
15666 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15667 return LowerFixedLengthExtractVectorElt(Op, DAG);
15668
15669 // Check for non-constant or out of range lane.
15670 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15671 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15672 return SDValue();
15673
15674 // Insertion/extraction are legal for V128 types.
15675 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15676 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15677 VT == MVT::v8f16 || VT == MVT::v8bf16)
15678 return Op;
15679
15680 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15681 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15682 VT != MVT::v4bf16)
15683 return SDValue();
15684
15685 // For V64 types, we perform extraction by expanding the value
15686 // to a V128 type and perform the extraction on that.
15687 SDLoc DL(Op);
15688 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15689 EVT WideTy = WideVec.getValueType();
15690
15691 EVT ExtrTy = WideTy.getVectorElementType();
15692 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15693 ExtrTy = MVT::i32;
15694
15695 // For extractions, we just return the result directly.
15696 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15697 Op.getOperand(1));
15698}
15699
15700SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15701 SelectionDAG &DAG) const {
15702 EVT VT = Op.getValueType();
15704 "Only cases that extract a fixed length vector are supported!");
15705 EVT InVT = Op.getOperand(0).getValueType();
15706
15707 // If we don't have legal types yet, do nothing
15708 if (!isTypeLegal(InVT))
15709 return SDValue();
15710
15711 if (InVT.is128BitVector()) {
15712 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15713 unsigned Idx = Op.getConstantOperandVal(1);
15714
15715 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15716 if (Idx == 0)
15717 return Op;
15718
15719 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15720 // that directly.
15721 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15722 return Op;
15723 }
15724
15725 if (InVT.isScalableVector() ||
15726 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15727 SDLoc DL(Op);
15728 SDValue Vec = Op.getOperand(0);
15729 SDValue Idx = Op.getOperand(1);
15730
15731 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15732 if (PackedVT != InVT) {
15733 // Pack input into the bottom part of an SVE register and try again.
15734 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15735 DAG.getUNDEF(PackedVT), Vec,
15736 DAG.getVectorIdxConstant(0, DL));
15737 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15738 }
15739
15740 // This will get matched by custom code during ISelDAGToDAG.
15741 if (isNullConstant(Idx))
15742 return Op;
15743
15744 assert(InVT.isScalableVector() && "Unexpected vector type!");
15745 // Move requested subvector to the start of the vector and try again.
15746 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15747 return convertFromScalableVector(DAG, VT, Splice);
15748 }
15749
15750 return SDValue();
15751}
15752
15753SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15754 SelectionDAG &DAG) const {
15755 assert(Op.getValueType().isScalableVector() &&
15756 "Only expect to lower inserts into scalable vectors!");
15757
15758 EVT InVT = Op.getOperand(1).getValueType();
15759 unsigned Idx = Op.getConstantOperandVal(2);
15760
15761 SDValue Vec0 = Op.getOperand(0);
15762 SDValue Vec1 = Op.getOperand(1);
15763 SDLoc DL(Op);
15764 EVT VT = Op.getValueType();
15765
15766 if (InVT.isScalableVector()) {
15767 if (!isTypeLegal(VT))
15768 return SDValue();
15769
15770 // Break down insert_subvector into simpler parts.
15771 if (VT.getVectorElementType() == MVT::i1) {
15772 unsigned NumElts = VT.getVectorMinNumElements();
15773 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15774
15775 SDValue Lo, Hi;
15776 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15777 DAG.getVectorIdxConstant(0, DL));
15778 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15779 DAG.getVectorIdxConstant(NumElts / 2, DL));
15780 if (Idx < (NumElts / 2))
15781 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15782 DAG.getVectorIdxConstant(Idx, DL));
15783 else
15784 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15785 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15786
15787 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15788 }
15789
15790 // We can select these directly.
15791 if (isTypeLegal(InVT) && Vec0.isUndef())
15792 return Op;
15793
15794 // Ensure the subvector is half the size of the main vector.
15795 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15796 return SDValue();
15797
15798 // Here narrow and wide refer to the vector element types. After "casting"
15799 // both vectors must have the same bit length and so because the subvector
15800 // has fewer elements, those elements need to be bigger.
15801 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15802 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15803
15804 // NOP cast operands to the largest legal vector of the same element count.
15805 if (VT.isFloatingPoint()) {
15806 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15807 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15808 } else {
15809 // Legal integer vectors are already their largest so Vec0 is fine as is.
15810 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15811 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15812 }
15813
15814 // To replace the top/bottom half of vector V with vector SubV we widen the
15815 // preserved half of V, concatenate this to SubV (the order depending on the
15816 // half being replaced) and then narrow the result.
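// For example, inserting an nxv2i32 subvector into the top half (Idx == 2) of
// an nxv4i32 Vec0: UUNPKLO widens the preserved low half to nxv2i64, the
// NVCAST reinterprets it as nxv4i32, and UZP1 packs those lanes together with
// the subvector's lanes into the final nxv4i32 value.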
15817 SDValue Narrow;
15818 if (Idx == 0) {
15819 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15820 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15821 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15822 } else {
15824 "Invalid subvector index!");
15825 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15826 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15827 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15828 }
15829
15830 return getSVESafeBitCast(VT, Narrow, DAG);
15831 }
15832
15833 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15834 // This will be matched by custom code during ISelDAGToDAG.
15835 if (Vec0.isUndef())
15836 return Op;
15837
15838 std::optional<unsigned> PredPattern =
15839 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15840 auto PredTy = VT.changeVectorElementType(MVT::i1);
15841 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15842 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15843 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15844 }
15845
15846 return SDValue();
15847}
15848
15849static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15850 if (Op.getOpcode() != AArch64ISD::DUP &&
15851 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15852 Op.getOpcode() != ISD::BUILD_VECTOR)
15853 return false;
15854
15855 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15856 !isAllConstantBuildVector(Op, SplatVal))
15857 return false;
15858
15859 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15860 !isa<ConstantSDNode>(Op->getOperand(0)))
15861 return false;
15862
15863 SplatVal = Op->getConstantOperandVal(0);
15864 if (Op.getValueType().getVectorElementType() != MVT::i64)
15865 SplatVal = (int32_t)SplatVal;
15866
15867 Negated = false;
15868 if (isPowerOf2_64(SplatVal))
15869 return true;
15870
15871 Negated = true;
15872 if (isPowerOf2_64(-SplatVal)) {
15873 SplatVal = -SplatVal;
15874 return true;
15875 }
15876
15877 return false;
15878}
15879
15880SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15881 EVT VT = Op.getValueType();
15882 SDLoc DL(Op);
15883
15884 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15885 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15886
15887 assert(VT.isScalableVector() && "Expected a scalable vector.");
15888
15889 bool Signed = Op.getOpcode() == ISD::SDIV;
15890 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15891
15892 bool Negated;
15893 uint64_t SplatVal;
15894 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
15895 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
15896 SDValue Res =
15897 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
15898 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
15899 if (Negated)
15900 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
15901
15902 return Res;
15903 }
15904
15905 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15906 return LowerToPredicatedOp(Op, DAG, PredOpcode);
15907
15908 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15909 // operations, and truncate the result.
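// For example, an nxv16i8 sdiv is unpacked with SUNPKLO/SUNPKHI into two
// nxv8i16 divides (which in turn get widened the same way, since SVE only
// provides 32- and 64-bit element divides); UZP1 then gathers the low byte of
// each widened quotient to rebuild the nxv16i8 result.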
15910 EVT WidenedVT;
15911 if (VT == MVT::nxv16i8)
15912 WidenedVT = MVT::nxv8i16;
15913 else if (VT == MVT::nxv8i16)
15914 WidenedVT = MVT::nxv4i32;
15915 else
15916 llvm_unreachable("Unexpected Custom DIV operation");
15917
15918 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15919 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15920 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
15921 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
15922 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
15923 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
15924 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
15925 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
15926 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
15927 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
15928 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
15929}
15930
15931bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15932 EVT VT, unsigned DefinedValues) const {
15933 if (!Subtarget->isNeonAvailable())
15934 return false;
15935 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15936}
15937
15938 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15939 // Currently no fixed length shuffles that require SVE are legal.
15940 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15941 return false;
15942
15943 if (VT.getVectorNumElements() == 4 &&
15944 (VT.is128BitVector() || VT.is64BitVector())) {
15945 unsigned Cost = getPerfectShuffleCost(M);
15946 if (Cost <= 1)
15947 return true;
15948 }
15949
15950 bool DummyBool;
15951 int DummyInt;
15952 unsigned DummyUnsigned;
15953
15954 unsigned EltSize = VT.getScalarSizeInBits();
15955 unsigned NumElts = VT.getVectorNumElements();
15956 return (ShuffleVectorSDNode::isSplatMask(M) ||
15957 isREVMask(M, EltSize, NumElts, 64) ||
15958 isREVMask(M, EltSize, NumElts, 32) ||
15959 isREVMask(M, EltSize, NumElts, 16) ||
15960 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15961 isSingletonEXTMask(M, VT, DummyUnsigned) ||
15962 isTRNMask(M, NumElts, DummyUnsigned) ||
15963 isUZPMask(M, NumElts, DummyUnsigned) ||
15964 isZIPMask(M, NumElts, DummyUnsigned) ||
15965 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15966 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15967 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15968 isINSMask(M, NumElts, DummyBool, DummyInt) ||
15969 isConcatMask(M, VT, VT.getSizeInBits() == 128));
15970}
15971
15972 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15973 EVT VT) const {
15974 // Just delegate to the generic legality, clear masks aren't special.
15975 return isShuffleMaskLegal(M, VT);
15976}
15977
15978/// getVShiftImm - Check if this is a valid build_vector for the immediate
15979/// operand of a vector shift operation, where all the elements of the
15980/// build_vector must have the same constant integer value.
15981static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15982 // Ignore bit_converts.
15983 while (Op.getOpcode() == ISD::BITCAST)
15984 Op = Op.getOperand(0);
15985 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15986 APInt SplatBits, SplatUndef;
15987 unsigned SplatBitSize;
15988 bool HasAnyUndefs;
15989 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15990 HasAnyUndefs, ElementBits) ||
15991 SplatBitSize > ElementBits)
15992 return false;
15993 Cnt = SplatBits.getSExtValue();
15994 return true;
15995}
15996
15997/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15998/// operand of a vector shift left operation. That value must be in the range:
15999/// 0 <= Value < ElementBits for a left shift; or
16000/// 0 <= Value <= ElementBits for a long left shift.
16001static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16002 assert(VT.isVector() && "vector shift count is not a vector type");
16003 int64_t ElementBits = VT.getScalarSizeInBits();
16004 if (!getVShiftImm(Op, ElementBits, Cnt))
16005 return false;
16006 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16007}
16008
16009/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16010/// operand of a vector shift right operation. The value must be in the range:
16011/// 1 <= Value <= ElementBits for a right shift; or
16012static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16013 assert(VT.isVector() && "vector shift count is not a vector type");
16014 int64_t ElementBits = VT.getScalarSizeInBits();
16015 if (!getVShiftImm(Op, ElementBits, Cnt))
16016 return false;
16017 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16018}
16019
16020SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16021 SelectionDAG &DAG) const {
16022 EVT VT = Op.getValueType();
16023
16024 if (VT.getScalarType() == MVT::i1) {
16025 // Lower i1 truncate to `(x & 1) != 0`.
16026 SDLoc DL(Op);
16027 EVT OpVT = Op.getOperand(0).getValueType();
16028 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16029 SDValue One = DAG.getConstant(1, DL, OpVT);
16030 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16031 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16032 }
16033
16034 if (!VT.isVector() || VT.isScalableVector())
16035 return SDValue();
16036
16037 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16038 !Subtarget->isNeonAvailable()))
16039 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16040
16041 return SDValue();
16042}
16043
16044 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16045 // possibly a truncated type; it tells how many bits of the value are to be
16046// used.
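// Sketch of the pattern being matched: (srl (add X, 1 << (Shift - 1)), Shift)
// rounds X to nearest before shifting, which matches SVE's URSHR. E.g. with
// Shift == 4 the add must be of 8, and the transform is only safe if the add
// cannot wrap the bits that survive truncation to ResVT.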
16047 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16048 SelectionDAG &DAG,
16049 unsigned &ShiftValue,
16050 SDValue &RShOperand) {
16051 if (Shift->getOpcode() != ISD::SRL)
16052 return false;
16053
16054 EVT VT = Shift.getValueType();
16055 assert(VT.isScalableVT());
16056
16057 auto ShiftOp1 =
16058 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16059 if (!ShiftOp1)
16060 return false;
16061
16062 ShiftValue = ShiftOp1->getZExtValue();
16063 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16064 return false;
16065
16066 SDValue Add = Shift->getOperand(0);
16067 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16068 return false;
16069
16071 "ResVT must be truncated or same type as the shift.");
16072 // Check if an overflow can lead to incorrect results.
16073 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16074 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16075 return false;
16076
16077 auto AddOp1 =
16078 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16079 if (!AddOp1)
16080 return false;
16081 uint64_t AddValue = AddOp1->getZExtValue();
16082 if (AddValue != 1ULL << (ShiftValue - 1))
16083 return false;
16084
16085 RShOperand = Add->getOperand(0);
16086 return true;
16087}
16088
16089SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16090 SelectionDAG &DAG) const {
16091 EVT VT = Op.getValueType();
16092 SDLoc DL(Op);
16093 int64_t Cnt;
16094
16095 if (!Op.getOperand(1).getValueType().isVector())
16096 return Op;
16097 unsigned EltSize = VT.getScalarSizeInBits();
16098
16099 switch (Op.getOpcode()) {
16100 case ISD::SHL:
16101 if (VT.isScalableVector() ||
16102 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16103 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16104
16105 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16106 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16107 DAG.getConstant(Cnt, DL, MVT::i32));
16108 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16109 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16110 MVT::i32),
16111 Op.getOperand(0), Op.getOperand(1));
16112 case ISD::SRA:
16113 case ISD::SRL:
16114 if (VT.isScalableVector() &&
16115 (Subtarget->hasSVE2() ||
16116 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16117 SDValue RShOperand;
16118 unsigned ShiftValue;
16119 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16120 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16121 getPredicateForVector(DAG, DL, VT), RShOperand,
16122 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16123 }
16124
16125 if (VT.isScalableVector() ||
16126 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16127 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16128 : AArch64ISD::SRL_PRED;
16129 return LowerToPredicatedOp(Op, DAG, Opc);
16130 }
16131
16132 // Right shift immediate
16133 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16134 unsigned Opc =
16135 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16136 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16137 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
16138 }
16139
16140 // Right shift register. Note, there is not a shift right register
16141 // instruction, but the shift left register instruction takes a signed
16142 // value, where negative numbers specify a right shift.
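// E.g. (srl X, splat(3)) becomes ushl(X, splat(-3)); the same trick with
// sshl handles arithmetic right shifts.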
16143 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16144 : Intrinsic::aarch64_neon_ushl;
16145 // negate the shift amount
16146 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16147 Op.getOperand(1));
16148 SDValue NegShiftLeft =
16149 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16150 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16151 NegShift);
16152 return NegShiftLeft;
16153 }
16154
16155 llvm_unreachable("unexpected shift opcode");
16156}
16157
16158SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16159 SelectionDAG &DAG) const {
16160 if (Op.getValueType().isScalableVector())
16161 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16162
16163 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16164 !Subtarget->isNeonAvailable()))
16165 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16166
16167 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16168 SDValue LHS = Op.getOperand(0);
16169 SDValue RHS = Op.getOperand(1);
16170 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16171 SDLoc DL(Op);
16172
16173 if (LHS.getValueType().getVectorElementType().isInteger())
16174 return Op;
16175
16176 assert(((!Subtarget->hasFullFP16() &&
16177 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16178 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16179 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16180 "Unexpected type!");
16181
16182 // Lower isnan(x) | isnan(never-nan) to x != x.
16183 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16184 if (CC == ISD::SETUO || CC == ISD::SETO) {
16185 bool OneNaN = false;
16186 if (LHS == RHS) {
16187 OneNaN = true;
16188 } else if (DAG.isKnownNeverNaN(RHS)) {
16189 OneNaN = true;
16190 RHS = LHS;
16191 } else if (DAG.isKnownNeverNaN(LHS)) {
16192 OneNaN = true;
16193 LHS = RHS;
16194 }
16195 if (OneNaN) {
16196 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16197 }
16198 }
16199
16200 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16201 // clean. Some of them require two branches to implement.
16202 AArch64CC::CondCode CC1, CC2;
16203 bool ShouldInvert;
16204 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16205
16206 bool NoNaNs =
16207 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16208 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16209 if (!Cmp.getNode())
16210 return SDValue();
16211
16212 if (CC2 != AArch64CC::AL) {
16213 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16214 if (!Cmp2.getNode())
16215 return SDValue();
16216
16217 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16218 }
16219
16220 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16221
16222 if (ShouldInvert)
16223 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16224
16225 return Cmp;
16226}
16227
16228static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16229 SelectionDAG &DAG) {
16230 SDValue VecOp = ScalarOp.getOperand(0);
16231 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16232 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16233 DAG.getConstant(0, DL, MVT::i64));
16234}
16235
16236static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16237 SDLoc DL, SelectionDAG &DAG) {
16238 unsigned ScalarOpcode;
16239 switch (Opcode) {
16240 case ISD::VECREDUCE_AND:
16241 ScalarOpcode = ISD::AND;
16242 break;
16243 case ISD::VECREDUCE_OR:
16244 ScalarOpcode = ISD::OR;
16245 break;
16246 case ISD::VECREDUCE_XOR:
16247 ScalarOpcode = ISD::XOR;
16248 break;
16249 default:
16250 llvm_unreachable("Expected bitwise vector reduction");
16251 return SDValue();
16252 }
16253
16254 EVT VecVT = Vec.getValueType();
16255 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16256 "Expected power-of-2 length vector");
16257
16258 EVT ElemVT = VecVT.getVectorElementType();
16259
16260 SDValue Result;
16261 unsigned NumElems = VecVT.getVectorNumElements();
16262
16263 // Special case for boolean reductions
16264 if (ElemVT == MVT::i1) {
16265 // Split large vectors into smaller ones
16266 if (NumElems > 16) {
16267 SDValue Lo, Hi;
16268 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16269 EVT HalfVT = Lo.getValueType();
16270 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16271 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16272 }
16273
16274 // Results of setcc operations get widened to 128 bits if their input
16275 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16276 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16277 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16278 // size leads to the best codegen, since e.g. setcc results might need to be
16279 // truncated otherwise.
16280 unsigned ExtendedWidth = 64;
16281 if (Vec.getOpcode() == ISD::SETCC &&
16282 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16283 ExtendedWidth = 128;
16284 }
16285 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16286
16287 // any_ext doesn't work with umin/umax, so only use it for uadd.
16288 unsigned ExtendOp =
16289 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16290 SDValue Extended = DAG.getNode(
16291 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16292 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16293 // in that case we bitcast the sign extended values from v2i64 to v4i32
16294 // before reduction for optimal code generation.
16295 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16296 NumElems == 2 && ExtendedWidth == 128) {
16297 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16298 ExtendedVT = MVT::i32;
16299 }
16300 switch (ScalarOpcode) {
16301 case ISD::AND:
16302 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16303 break;
16304 case ISD::OR:
16305 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16306 break;
16307 case ISD::XOR:
16308 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16309 break;
16310 default:
16311 llvm_unreachable("Unexpected Opcode");
16312 }
16313
16314 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16315 } else {
16316 // Iteratively split the vector in half and combine using the bitwise
16317 // operation until it fits in a 64 bit register.
16318 while (VecVT.getSizeInBits() > 64) {
16319 SDValue Lo, Hi;
16320 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16321 VecVT = Lo.getValueType();
16322 NumElems = VecVT.getVectorNumElements();
16323 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16324 }
16325
16326 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16327
16328 // Do the remaining work on a scalar since it allows the code generator to
16329 // combine the shift and bitwise operation into one instruction and since
16330 // integer instructions can have higher throughput than vector instructions.
16331 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16332
16333 // Iteratively combine the lower and upper halves of the scalar using the
16334 // bitwise operation, halving the relevant region of the scalar in each
16335 // iteration, until the relevant region is just one element of the original
16336 // vector.
16337 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16338 SDValue ShiftAmount =
16339 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16340 SDValue Shifted =
16341 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16342 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16343 }
16344
16345 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16346 }
16347
16348 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16349}
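// Rough sketch of the boolean path above (illustrative only): an
//   vecreduce_and <8 x i1> %m
// where %m comes from comparing 64-bit-wide vectors is sign-extended to
// <8 x i8> (all-ones / all-zeros lanes) and reduced with VECREDUCE_UMIN,
// i.e. a single UMINV, whose low byte is then truncated back to i1.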
16350
16351SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16352 SelectionDAG &DAG) const {
16353 SDValue Src = Op.getOperand(0);
16354 EVT SrcVT = Src.getValueType();
16355
16356 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16357 // widening by inserting zeroes.
16358 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16359 SrcVT == MVT::v2f16) {
16360 SDLoc DL(Op);
16361 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16362 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16363 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16364 }
16365
16366 // Try to lower fixed length reductions to SVE.
16367 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16368 Op.getOpcode() == ISD::VECREDUCE_AND ||
16369 Op.getOpcode() == ISD::VECREDUCE_OR ||
16370 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16371 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16372 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16373 SrcVT.getVectorElementType() == MVT::i64);
16374   if (SrcVT.isScalableVector() ||
16375       useSVEForFixedLengthVectorVT(
16376           SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16377
16378 if (SrcVT.getVectorElementType() == MVT::i1)
16379 return LowerPredReductionToSVE(Op, DAG);
16380
16381 switch (Op.getOpcode()) {
16382 case ISD::VECREDUCE_ADD:
16383 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16384 case ISD::VECREDUCE_AND:
16385 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16386 case ISD::VECREDUCE_OR:
16387 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16388   case ISD::VECREDUCE_SMAX:
16389     return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16390   case ISD::VECREDUCE_SMIN:
16391     return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16392   case ISD::VECREDUCE_UMAX:
16393     return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16394   case ISD::VECREDUCE_UMIN:
16395     return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16396   case ISD::VECREDUCE_XOR:
16397     return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16398   case ISD::VECREDUCE_FADD:
16399     return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16400   case ISD::VECREDUCE_FMAX:
16401     return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16402   case ISD::VECREDUCE_FMIN:
16403     return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16404   case ISD::VECREDUCE_FMAXIMUM:
16405     return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16406   case ISD::VECREDUCE_FMINIMUM:
16407     return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16408 default:
16409 llvm_unreachable("Unhandled fixed length reduction");
16410 }
16411 }
16412
16413 // Lower NEON reductions.
16414 SDLoc DL(Op);
16415 switch (Op.getOpcode()) {
16416 case ISD::VECREDUCE_AND:
16417 case ISD::VECREDUCE_OR:
16418 case ISD::VECREDUCE_XOR:
16419 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16420 Op.getValueType(), DL, DAG);
16421 case ISD::VECREDUCE_ADD:
16422 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16423   case ISD::VECREDUCE_SMAX:
16424     return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16425   case ISD::VECREDUCE_SMIN:
16426     return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16427   case ISD::VECREDUCE_UMAX:
16428     return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16429   case ISD::VECREDUCE_UMIN:
16430     return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16431 default:
16432 llvm_unreachable("Unhandled reduction");
16433 }
16434}
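// For example (illustrative), with +fullfp16 a
//   vecreduce_fadd <2 x half> %v
// is scalarised by the code above into
//   fadd (extractelement %v, 0), (extractelement %v, 1)
// which selects to a single FADDP on the H registers rather than widening the
// vector with zero lanes.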
16435
16436SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16437 SelectionDAG &DAG) const {
16438 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16439 // No point replacing if we don't have the relevant instruction/libcall anyway
16440 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16441 return SDValue();
16442
16443 // LSE has an atomic load-clear instruction, but not a load-and.
16444 SDLoc DL(Op);
16445 MVT VT = Op.getSimpleValueType();
16446 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16447 SDValue RHS = Op.getOperand(2);
16448 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16449 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16450 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16451 Op.getOperand(0), Op.getOperand(1), RHS,
16452 AN->getMemOperand());
16453}
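// Sketch of the rewrite above (illustrative): with LSE available, an
//   atomicrmw and ptr %p, i32 %v
// is re-expressed as an atomic load-clear of the complemented operand,
//   atomic_load_clr %p, (xor %v, -1)
// which selects to a single LDCLR-family instruction (LDCLR/LDCLRAL etc.
// depending on the memory ordering).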
16454
16455SDValue
16456AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16457 SelectionDAG &DAG) const {
16458
16459 SDLoc DL(Op);
16460 // Get the inputs.
16461 SDNode *Node = Op.getNode();
16462 SDValue Chain = Op.getOperand(0);
16463 SDValue Size = Op.getOperand(1);
16464   MaybeAlign Align =
16465       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16466 EVT VT = Node->getValueType(0);
16467
16468   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16469           "no-stack-arg-probe")) {
16470 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16471 Chain = SP.getValue(1);
16472 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16473 if (Align)
16474 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16475 DAG.getSignedConstant(-Align->value(), DL, VT));
16476 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16477 SDValue Ops[2] = {SP, Chain};
16478 return DAG.getMergeValues(Ops, DL);
16479 }
16480
16481 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16482
16483 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16484   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16485                                                PtrVT, 0);
16486
16487 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16488 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16489 if (Subtarget->hasCustomCallingConv())
16490 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16491
16492 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16493 DAG.getConstant(4, DL, MVT::i64));
16494 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16495 Chain =
16496 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16497 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16498 DAG.getRegisterMask(Mask), Chain.getValue(1));
16499 // To match the actual intent better, we should read the output from X15 here
16500 // again (instead of potentially spilling it to the stack), but rereading Size
16501 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16502 // here.
16503
16504 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16505 DAG.getConstant(4, DL, MVT::i64));
16506
16507 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16508 Chain = SP.getValue(1);
16509 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16510 if (Align)
16511 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16512 DAG.getSignedConstant(-Align->value(), DL, VT));
16513 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16514
16515 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16516
16517 SDValue Ops[2] = {SP, Chain};
16518 return DAG.getMergeValues(Ops, DL);
16519}
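// Conceptually (illustrative; the exact symbol comes from the subtarget, on
// Windows it is typically __chkstk), the probing sequence built above is:
//   x15 = bytes / 16        ; requested size is passed in 16-byte units
//   bl  __chkstk            ; probes each page of the new stack region
//   sp  = sp - (x15 * 16)   ; then the actual allocation, re-aligned if needed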
16520
16521SDValue
16522AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16523 SelectionDAG &DAG) const {
16524 // Get the inputs.
16525 SDNode *Node = Op.getNode();
16526 SDValue Chain = Op.getOperand(0);
16527 SDValue Size = Op.getOperand(1);
16528
16529   MaybeAlign Align =
16530       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16531 SDLoc DL(Op);
16532 EVT VT = Node->getValueType(0);
16533
16534 // Construct the new SP value in a GPR.
16535 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16536 Chain = SP.getValue(1);
16537 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16538 if (Align)
16539 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16540 DAG.getSignedConstant(-Align->value(), DL, VT));
16541
16542 // Set the real SP to the new value with a probing loop.
16543 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16544 SDValue Ops[2] = {SP, Chain};
16545 return DAG.getMergeValues(Ops, DL);
16546}
16547
16548SDValue
16549AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16550 SelectionDAG &DAG) const {
16551   MachineFunction &MF = DAG.getMachineFunction();
16552
16553 if (Subtarget->isTargetWindows())
16554 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16555 else if (hasInlineStackProbe(MF))
16556 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16557 else
16558 return SDValue();
16559}
16560
16561SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16562 unsigned NewOp) const {
16563 if (Subtarget->hasSVE2())
16564 return LowerToPredicatedOp(Op, DAG, NewOp);
16565
16566 // Default to expand.
16567 return SDValue();
16568}
16569
16570SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16571 SelectionDAG &DAG) const {
16572 EVT VT = Op.getValueType();
16573 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16574
16575 SDLoc DL(Op);
16576 APInt MulImm = Op.getConstantOperandAPInt(0);
16577 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16578 VT);
16579}
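// For example (illustrative): an illegal-typed
//   %n = 4 * llvm.vscale.i32()      ; i.e. ISD::VSCALE with MulImm = 4
// is rebuilt above as an i64 VSCALE(4) and zero-extended/truncated back to
// i32, so later legalization and selection only ever see the MVT::i64 form.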
16580
16581/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16582template <unsigned NumVecs>
16583static bool
16584 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16585               AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16586   Info.opc = ISD::INTRINSIC_VOID;
16587   // Retrieve EC from first vector argument.
16588 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16589   const ElementCount EC = VT.getVectorElementCount();
16590 #ifndef NDEBUG
16591 // Check the assumption that all input vectors are the same type.
16592 for (unsigned I = 0; I < NumVecs; ++I)
16593 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16594 "Invalid type.");
16595#endif
16596 // memVT is `NumVecs * VT`.
16597   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16598                                 EC * NumVecs);
16599 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16600 Info.offset = 0;
16601 Info.align.reset();
16602   Info.flags = MachineMemOperand::MOStore;
16603   return true;
16604}
16605
16606/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16607/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16608/// specified in the intrinsic calls.
16609 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16610                                                const CallInst &I,
16611 MachineFunction &MF,
16612 unsigned Intrinsic) const {
16613 auto &DL = I.getDataLayout();
16614 switch (Intrinsic) {
16615 case Intrinsic::aarch64_sve_st2:
16616 return setInfoSVEStN<2>(*this, DL, Info, I);
16617 case Intrinsic::aarch64_sve_st3:
16618 return setInfoSVEStN<3>(*this, DL, Info, I);
16619 case Intrinsic::aarch64_sve_st4:
16620 return setInfoSVEStN<4>(*this, DL, Info, I);
16621 case Intrinsic::aarch64_neon_ld2:
16622 case Intrinsic::aarch64_neon_ld3:
16623 case Intrinsic::aarch64_neon_ld4:
16624 case Intrinsic::aarch64_neon_ld1x2:
16625 case Intrinsic::aarch64_neon_ld1x3:
16626 case Intrinsic::aarch64_neon_ld1x4: {
16627     Info.opc = ISD::INTRINSIC_W_CHAIN;
16628     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16629 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16630 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16631 Info.offset = 0;
16632 Info.align.reset();
16633 // volatile loads with NEON intrinsics not supported
16634     Info.flags = MachineMemOperand::MOLoad;
16635     return true;
16636 }
16637 case Intrinsic::aarch64_neon_ld2lane:
16638 case Intrinsic::aarch64_neon_ld3lane:
16639 case Intrinsic::aarch64_neon_ld4lane:
16640 case Intrinsic::aarch64_neon_ld2r:
16641 case Intrinsic::aarch64_neon_ld3r:
16642 case Intrinsic::aarch64_neon_ld4r: {
16643     Info.opc = ISD::INTRINSIC_W_CHAIN;
16644     // The ldN lane/dup intrinsics return a struct whose members all have the
16645     // same vector type.
16645 Type *RetTy = I.getType();
16646 auto *StructTy = cast<StructType>(RetTy);
16647 unsigned NumElts = StructTy->getNumElements();
16648 Type *VecTy = StructTy->getElementType(0);
16649 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16650 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16651 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16652 Info.offset = 0;
16653 Info.align.reset();
16654 // volatile loads with NEON intrinsics not supported
16655     Info.flags = MachineMemOperand::MOLoad;
16656     return true;
16657 }
16658 case Intrinsic::aarch64_neon_st2:
16659 case Intrinsic::aarch64_neon_st3:
16660 case Intrinsic::aarch64_neon_st4:
16661 case Intrinsic::aarch64_neon_st1x2:
16662 case Intrinsic::aarch64_neon_st1x3:
16663 case Intrinsic::aarch64_neon_st1x4: {
16664     Info.opc = ISD::INTRINSIC_VOID;
16665     unsigned NumElts = 0;
16666 for (const Value *Arg : I.args()) {
16667 Type *ArgTy = Arg->getType();
16668 if (!ArgTy->isVectorTy())
16669 break;
16670 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16671 }
16672 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16673 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16674 Info.offset = 0;
16675 Info.align.reset();
16676 // volatile stores with NEON intrinsics not supported
16677     Info.flags = MachineMemOperand::MOStore;
16678     return true;
16679 }
16680 case Intrinsic::aarch64_neon_st2lane:
16681 case Intrinsic::aarch64_neon_st3lane:
16682 case Intrinsic::aarch64_neon_st4lane: {
16683     Info.opc = ISD::INTRINSIC_VOID;
16684     unsigned NumElts = 0;
16685     // All of the stored vector arguments have the same type.
16686 Type *VecTy = I.getArgOperand(0)->getType();
16687 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16688
16689 for (const Value *Arg : I.args()) {
16690 Type *ArgTy = Arg->getType();
16691 if (!ArgTy->isVectorTy())
16692 break;
16693 NumElts += 1;
16694 }
16695
16696 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16697 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16698 Info.offset = 0;
16699 Info.align.reset();
16700 // volatile stores with NEON intrinsics not supported
16701     Info.flags = MachineMemOperand::MOStore;
16702     return true;
16703 }
16704 case Intrinsic::aarch64_ldaxr:
16705 case Intrinsic::aarch64_ldxr: {
16706 Type *ValTy = I.getParamElementType(0);
16707     Info.opc = ISD::INTRINSIC_W_CHAIN;
16708     Info.memVT = MVT::getVT(ValTy);
16709 Info.ptrVal = I.getArgOperand(0);
16710 Info.offset = 0;
16711 Info.align = DL.getABITypeAlign(ValTy);
16712     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16713     return true;
16714 }
16715 case Intrinsic::aarch64_stlxr:
16716 case Intrinsic::aarch64_stxr: {
16717 Type *ValTy = I.getParamElementType(1);
16718     Info.opc = ISD::INTRINSIC_W_CHAIN;
16719     Info.memVT = MVT::getVT(ValTy);
16720 Info.ptrVal = I.getArgOperand(1);
16721 Info.offset = 0;
16722 Info.align = DL.getABITypeAlign(ValTy);
16723     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16724     return true;
16725 }
16726 case Intrinsic::aarch64_ldaxp:
16727 case Intrinsic::aarch64_ldxp:
16728     Info.opc = ISD::INTRINSIC_W_CHAIN;
16729     Info.memVT = MVT::i128;
16730 Info.ptrVal = I.getArgOperand(0);
16731 Info.offset = 0;
16732 Info.align = Align(16);
16733     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16734     return true;
16735 case Intrinsic::aarch64_stlxp:
16736 case Intrinsic::aarch64_stxp:
16737     Info.opc = ISD::INTRINSIC_W_CHAIN;
16738     Info.memVT = MVT::i128;
16739 Info.ptrVal = I.getArgOperand(2);
16740 Info.offset = 0;
16741 Info.align = Align(16);
16742     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16743     return true;
16744 case Intrinsic::aarch64_sve_ldnt1: {
16745 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16746     Info.opc = ISD::INTRINSIC_W_CHAIN;
16747     Info.memVT = MVT::getVT(I.getType());
16748 Info.ptrVal = I.getArgOperand(1);
16749 Info.offset = 0;
16750 Info.align = DL.getABITypeAlign(ElTy);
16751     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16752     return true;
16753 }
16754 case Intrinsic::aarch64_sve_stnt1: {
16755 Type *ElTy =
16756 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16757     Info.opc = ISD::INTRINSIC_W_CHAIN;
16758     Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16759 Info.ptrVal = I.getArgOperand(2);
16760 Info.offset = 0;
16761 Info.align = DL.getABITypeAlign(ElTy);
16762     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16763     return true;
16764 }
16765 case Intrinsic::aarch64_mops_memset_tag: {
16766 Value *Dst = I.getArgOperand(0);
16767 Value *Val = I.getArgOperand(1);
16768     Info.opc = ISD::INTRINSIC_W_CHAIN;
16769     Info.memVT = MVT::getVT(Val->getType());
16770 Info.ptrVal = Dst;
16771 Info.offset = 0;
16772     Info.align = I.getParamAlign(0).valueOrOne();
16773     Info.flags = MachineMemOperand::MOStore;
16774     // The size of the memory being operated on is unknown at this point
16775     Info.size = MemoryLocation::UnknownSize;
16776     return true;
16777 }
16778 default:
16779 break;
16780 }
16781
16782 return false;
16783}
16784
16785 bool AArch64TargetLowering::shouldReduceLoadWidth(
16786     SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
16787 std::optional<unsigned> ByteOffset) const {
16788 // TODO: This may be worth removing. Check regression tests for diffs.
16789 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
16790 ByteOffset))
16791 return false;
16792
16793 // If we're reducing the load width in order to avoid having to use an extra
16794 // instruction to do extension then it's probably a good idea.
16795 if (ExtTy != ISD::NON_EXTLOAD)
16796 return true;
16797 // Don't reduce load width if it would prevent us from combining a shift into
16798 // the offset.
16799 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16800 assert(Mem);
16801 const SDValue &Base = Mem->getBasePtr();
16802 if (Base.getOpcode() == ISD::ADD &&
16803 Base.getOperand(1).getOpcode() == ISD::SHL &&
16804 Base.getOperand(1).hasOneUse() &&
16805 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16806 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16807 if (Mem->getMemoryVT().isScalableVector())
16808 return false;
16809 // The shift can be combined if it matches the size of the value being
16810 // loaded (and so reducing the width would make it not match).
16811 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16812 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16813 if (ShiftAmount == Log2_32(LoadBytes))
16814 return false;
16815 }
16816 // We have no reason to disallow reducing the load width, so allow it.
16817 return true;
16818}
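// Illustrative example of the guard above: for
//   (i64 load (add %base, (shl %idx, 3)))
// the shift matches the 8-byte access size and folds into the scaled
// addressing mode, e.g. "ldr x0, [x1, x2, lsl #3]"; narrowing the load to i32
// would change the required shift amount to 2 and break that fold, so the
// width reduction is rejected.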
16819
16820// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16821 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16822   EVT VT = Extend.getValueType();
16823 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16824 SDValue Extract = Extend.getOperand(0);
16825 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16826 Extract = Extract.getOperand(0);
16827 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16828 EVT VecVT = Extract.getOperand(0).getValueType();
16829 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16830 return false;
16831 }
16832 }
16833 return true;
16834}
16835
16836 // Truncations from 64-bit GPR to 32-bit GPR are free.
16837 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16838   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16839 return false;
16840 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16841 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16842 return NumBits1 > NumBits2;
16843}
16844 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16845   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16846 return false;
16847 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16848 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16849 return NumBits1 > NumBits2;
16850}
16851
16852/// Check if it is profitable to hoist instruction in then/else to if.
16853 /// Not profitable if I and its user can form an FMA instruction
16854 /// because we prefer FMSUB/FMADD.
16855 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16856   if (I->getOpcode() != Instruction::FMul)
16857 return true;
16858
16859 if (!I->hasOneUse())
16860 return true;
16861
16862 Instruction *User = I->user_back();
16863
16864 if (!(User->getOpcode() == Instruction::FSub ||
16865 User->getOpcode() == Instruction::FAdd))
16866 return true;
16867
16868   const TargetOptions &Options = getTargetMachine().Options;
16869   const Function *F = I->getFunction();
16870 const DataLayout &DL = F->getDataLayout();
16871 Type *Ty = User->getOperand(0)->getType();
16872
16873 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16874            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16875            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16876 I->getFastMathFlags().allowContract()));
16877}
16878
16879// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16880// 64-bit GPR.
16881 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16882   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16883 return false;
16884 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16885 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16886 return NumBits1 == 32 && NumBits2 == 64;
16887}
16888 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16889   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16890 return false;
16891 unsigned NumBits1 = VT1.getSizeInBits();
16892 unsigned NumBits2 = VT2.getSizeInBits();
16893 return NumBits1 == 32 && NumBits2 == 64;
16894}
16895
16896 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16897   EVT VT1 = Val.getValueType();
16898 if (isZExtFree(VT1, VT2)) {
16899 return true;
16900 }
16901
16902 if (Val.getOpcode() != ISD::LOAD)
16903 return false;
16904
16905 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16906 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16907 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16908 VT1.getSizeInBits() <= 32);
16909}
16910
16911bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16912 if (isa<FPExtInst>(Ext))
16913 return false;
16914
16915 // Vector types are not free.
16916 if (Ext->getType()->isVectorTy())
16917 return false;
16918
16919 for (const Use &U : Ext->uses()) {
16920 // The extension is free if we can fold it with a left shift in an
16921 // addressing mode or an arithmetic operation: add, sub, and cmp.
16922
16923 // Is there a shift?
16924 const Instruction *Instr = cast<Instruction>(U.getUser());
16925
16926 // Is this a constant shift?
16927 switch (Instr->getOpcode()) {
16928 case Instruction::Shl:
16929 if (!isa<ConstantInt>(Instr->getOperand(1)))
16930 return false;
16931 break;
16932 case Instruction::GetElementPtr: {
16933 gep_type_iterator GTI = gep_type_begin(Instr);
16934 auto &DL = Ext->getDataLayout();
16935 std::advance(GTI, U.getOperandNo()-1);
16936 Type *IdxTy = GTI.getIndexedType();
16937 // This extension will end up with a shift because of the scaling factor.
16938 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16939 // Get the shift amount based on the scaling factor:
16940 // log2(sizeof(IdxTy)) - log2(8).
16941 if (IdxTy->isScalableTy())
16942 return false;
16943 uint64_t ShiftAmt =
16944 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16945 3;
16946 // Is the constant foldable in the shift of the addressing mode?
16947 // I.e., shift amount is between 1 and 4 inclusive.
16948 if (ShiftAmt == 0 || ShiftAmt > 4)
16949 return false;
16950 break;
16951 }
16952 case Instruction::Trunc:
16953 // Check if this is a noop.
16954 // trunc(sext ty1 to ty2) to ty1.
16955 if (Instr->getType() == Ext->getOperand(0)->getType())
16956 continue;
16957 [[fallthrough]];
16958 default:
16959 return false;
16960 }
16961
16962 // At this point we can use the bfm family, so this extension is free
16963 // for that use.
16964 }
16965 return true;
16966}
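// For instance (illustrative), in
//   %i = zext i32 %w to i64
//   %p = getelementptr i64, ptr %base, i64 %i
// the 3-bit scale implied by the i64 element type falls in the 1..4 range
// checked above, so the extend folds into the addressing mode, e.g.
//   ldr x0, [x1, w2, uxtw #3]
// and is therefore treated as free.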
16967
16968static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16969 unsigned NumElts, bool IsLittleEndian,
16970 SmallVectorImpl<int> &Mask) {
16971 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16972 return false;
16973
16974 assert(DstWidth % SrcWidth == 0 &&
16975 "TBL lowering is not supported for a conversion instruction with this "
16976 "source and destination element type.");
16977
16978 unsigned Factor = DstWidth / SrcWidth;
16979 unsigned MaskLen = NumElts * Factor;
16980
16981 Mask.clear();
16982 Mask.resize(MaskLen, NumElts);
16983
16984 unsigned SrcIndex = 0;
16985 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16986 Mask[I] = SrcIndex++;
16987
16988 return true;
16989}
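// Worked example (illustrative): SrcWidth = 8, DstWidth = 32, NumElts = 4 on a
// little-endian target gives Factor = 4, MaskLen = 16 and the mask
//   [0, 4, 4, 4,  1, 4, 4, 4,  2, 4, 4, 4,  3, 4, 4, 4]
// where index 4 (== NumElts) selects the zero element that the callers insert
// into the second shuffle operand, i.e. each i8 lands in the low byte of a
// zero-filled i32 lane.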
16990
16991 static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16992                                       FixedVectorType *ZExtTy,
16993 FixedVectorType *DstTy,
16994 bool IsLittleEndian) {
16995 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16996 unsigned NumElts = SrcTy->getNumElements();
16997 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16998 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16999
17000 SmallVector<int> Mask;
17001 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17002 return nullptr;
17003
17004 auto *FirstEltZero = Builder.CreateInsertElement(
17005 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17006 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17007 Result = Builder.CreateBitCast(Result, DstTy);
17008 if (DstTy != ZExtTy)
17009 Result = Builder.CreateZExt(Result, ZExtTy);
17010 return Result;
17011}
17012
17013 static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
17014                                       FixedVectorType *DstTy,
17015 bool IsLittleEndian) {
17016 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17017 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17018 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17019
17020 SmallVector<int> Mask;
17021 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17022 !IsLittleEndian, Mask))
17023 return nullptr;
17024
17025 auto *FirstEltZero = Builder.CreateInsertElement(
17026 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17027
17028 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17029}
17030
17031static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17032 IRBuilder<> Builder(TI);
17033   SmallVector<Value *> Parts;
17034   int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17035 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17036 auto *DstTy = cast<FixedVectorType>(TI->getType());
17037 assert(SrcTy->getElementType()->isIntegerTy() &&
17038 "Non-integer type source vector element is not supported");
17039 assert(DstTy->getElementType()->isIntegerTy(8) &&
17040 "Unsupported destination vector element type");
17041 unsigned SrcElemTySz =
17042 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17043 unsigned DstElemTySz =
17044 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17045 assert((SrcElemTySz % DstElemTySz == 0) &&
17046 "Cannot lower truncate to tbl instructions for a source element size "
17047 "that is not divisible by the destination element size");
17048 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17049 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17050 "Unsupported source vector element type size");
17051 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17052
17053 // Create a mask to choose every nth byte from the source vector table of
17054 // bytes to create the truncated destination vector, where 'n' is the truncate
17055 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17056 // 0,8,16,..Y*8th bytes for the little-endian format
17057   SmallVector<Constant *, 16> MaskConst;
17058   for (int Itr = 0; Itr < 16; Itr++) {
17059 if (Itr < NumElements)
17060 MaskConst.push_back(Builder.getInt8(
17061 IsLittleEndian ? Itr * TruncFactor
17062 : Itr * TruncFactor + (TruncFactor - 1)));
17063 else
17064 MaskConst.push_back(Builder.getInt8(255));
17065 }
17066
17067 int MaxTblSz = 128 * 4;
17068 int MaxSrcSz = SrcElemTySz * NumElements;
17069 int ElemsPerTbl =
17070 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17071 assert(ElemsPerTbl <= 16 &&
17072 "Maximum elements selected using TBL instruction cannot exceed 16!");
17073
17074 int ShuffleCount = 128 / SrcElemTySz;
17075 SmallVector<int> ShuffleLanes;
17076 for (int i = 0; i < ShuffleCount; ++i)
17077 ShuffleLanes.push_back(i);
17078
17079 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17080 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17081 // call TBL & save the result in a vector of TBL results for combining later.
17082   SmallVector<Value *> Results;
17083   while (ShuffleLanes.back() < NumElements) {
17084 Parts.push_back(Builder.CreateBitCast(
17085 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17086
17087 if (Parts.size() == 4) {
17088 Parts.push_back(ConstantVector::get(MaskConst));
17089 Results.push_back(
17090 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17091 Parts.clear();
17092 }
17093
17094 for (int i = 0; i < ShuffleCount; ++i)
17095 ShuffleLanes[i] += ShuffleCount;
17096 }
17097
17098 assert((Parts.empty() || Results.empty()) &&
17099 "Lowering trunc for vectors requiring different TBL instructions is "
17100 "not supported!");
17101 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17102 // registers
17103 if (!Parts.empty()) {
17104 Intrinsic::ID TblID;
17105 switch (Parts.size()) {
17106 case 1:
17107 TblID = Intrinsic::aarch64_neon_tbl1;
17108 break;
17109 case 2:
17110 TblID = Intrinsic::aarch64_neon_tbl2;
17111 break;
17112 case 3:
17113 TblID = Intrinsic::aarch64_neon_tbl3;
17114 break;
17115 }
17116
17117 Parts.push_back(ConstantVector::get(MaskConst));
17118 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17119 }
17120
17121 // Extract the destination vector from TBL result(s) after combining them
17122 // where applicable. Currently, at most two TBLs are supported.
17123 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17124 "more than 2 tbl instructions!");
17125 Value *FinalResult = Results[0];
17126 if (Results.size() == 1) {
17127 if (ElemsPerTbl < 16) {
17128 SmallVector<int> FinalMask(ElemsPerTbl);
17129 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17130 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17131 }
17132 } else {
17133 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17134 if (ElemsPerTbl < 16) {
17135 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17136 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17137 } else {
17138 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17139 }
17140 FinalResult =
17141 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17142 }
17143
17144 TI->replaceAllUsesWith(FinalResult);
17145 TI->eraseFromParent();
17146}
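// Illustrative end-to-end example: for
//   %t = trunc <8 x i32> %v to <8 x i8>
// the source is split into two 128-bit halves, bitcast to <16 x i8>, and fed
// to tbl2 with the byte mask {0, 4, 8, 12, 16, 20, 24, 28, 255, ...} built
// above (little-endian), after which the low 8 bytes are extracted with a
// final shufflevector.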
17147
17148 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17149     Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17150 // shuffle_vector instructions are serialized when targeting SVE,
17151 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17152 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17153 return false;
17154
17155 // Try to optimize conversions using tbl. This requires materializing constant
17156 // index vectors, which can increase code size and add loads. Skip the
17157 // transform unless the conversion is in a loop block guaranteed to execute
17158 // and we are not optimizing for size.
17159 Function *F = I->getParent()->getParent();
17160 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17161 return false;
17162
17163 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17164 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17165 if (!SrcTy || !DstTy)
17166 return false;
17167
17168 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17169 // lowered to tbl instructions to insert the original i8 elements
17170 // into i8x lanes. This is enabled for cases where it is beneficial.
17171 auto *ZExt = dyn_cast<ZExtInst>(I);
17172 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17173 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17174 if (DstWidth % 8 != 0)
17175 return false;
17176
17177 auto *TruncDstType =
17178 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
17179 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17180 // the remaining ZExt folded into the user, don't use tbl lowering.
17181 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17182 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17185 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17186 return false;
17187
17188 DstTy = TruncDstType;
17189 }
17190
17191 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17192 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17193 // most one extra extend step is needed and using tbl is not profitable.
17194 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17195 // udot instruction.
17196 if (SrcWidth * 4 <= DstWidth) {
17197 if (all_of(I->users(), [&](auto *U) {
17198 auto *SingleUser = cast<Instruction>(&*U);
17199 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17200 return true;
17201 if (match(SingleUser,
17202 m_Intrinsic<
17203 Intrinsic::experimental_vector_partial_reduce_add>(
17204 m_Value(), m_Specific(I))))
17205 return true;
17206 return false;
17207 }))
17208 return false;
17209 }
17210
17211 if (DstTy->getScalarSizeInBits() >= 64)
17212 return false;
17213
17214 IRBuilder<> Builder(ZExt);
17215     Value *Result = createTblShuffleForZExt(
17216         Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17217 DstTy, Subtarget->isLittleEndian());
17218 if (!Result)
17219 return false;
17220 ZExt->replaceAllUsesWith(Result);
17221 ZExt->eraseFromParent();
17222 return true;
17223 }
17224
17225 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17226 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17227 DstTy->getElementType()->isFloatTy()) ||
17228 (SrcTy->getElementType()->isIntegerTy(16) &&
17229 DstTy->getElementType()->isDoubleTy()))) {
17230 IRBuilder<> Builder(I);
17231     Value *ZExt = createTblShuffleForZExt(
17232         Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17233 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17234 assert(ZExt && "Cannot fail for the i8 to float conversion");
17235 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17236 I->replaceAllUsesWith(UI);
17237 I->eraseFromParent();
17238 return true;
17239 }
17240
17241 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17242 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17243 DstTy->getElementType()->isFloatTy()) {
17244 IRBuilder<> Builder(I);
17245 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17246                                             FixedVectorType::getInteger(DstTy),
17247                                             Subtarget->isLittleEndian());
17248 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17249 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17250 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17251 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17252 I->replaceAllUsesWith(SI);
17253 I->eraseFromParent();
17254 return true;
17255 }
17256
17257 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17258 // followed by a truncate lowered to using tbl.4.
17259 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17260 if (FPToUI &&
17261 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17262 SrcTy->getElementType()->isFloatTy() &&
17263 DstTy->getElementType()->isIntegerTy(8)) {
17264 IRBuilder<> Builder(I);
17265 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17266 VectorType::getInteger(SrcTy));
17267 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17268 I->replaceAllUsesWith(TruncI);
17269 I->eraseFromParent();
17270 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17271 return true;
17272 }
17273
17274 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17275 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17276 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17277 // registers
17278 auto *TI = dyn_cast<TruncInst>(I);
17279 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17280 ((SrcTy->getElementType()->isIntegerTy(32) ||
17281 SrcTy->getElementType()->isIntegerTy(64)) &&
17282 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17283 createTblForTrunc(TI, Subtarget->isLittleEndian());
17284 return true;
17285 }
17286
17287 return false;
17288}
17289
17290 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17291                                           Align &RequiredAlignment) const {
17292 if (!LoadedType.isSimple() ||
17293 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17294 return false;
17295 // Cyclone supports unaligned accesses.
17296 RequiredAlignment = Align(1);
17297 unsigned NumBits = LoadedType.getSizeInBits();
17298 return NumBits == 32 || NumBits == 64;
17299}
17300
17301/// A helper function for determining the number of interleaved accesses we
17302/// will generate when lowering accesses of the given type.
17303 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17304     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17305 unsigned VecSize = 128;
17306 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17307 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17308 if (UseScalable && isa<FixedVectorType>(VecTy))
17309 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17310 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17311}
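// For example (illustrative): a <16 x i32> group (512 bits) with 128-bit NEON
// vectors yields (16 * 32 + 127) / 128 = 4 accesses, i.e. the wide interleaved
// load or store is legalised into four NEON ldN/stN operations.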
17312
17313 MachineMemOperand::Flags
17314 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17315   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17316       I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17317     return MOStridedAccess;
17318   return MachineMemOperand::MONone;
17319 }
17320
17321 bool AArch64TargetLowering::isLegalInterleavedAccessType(
17322     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17323 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17324 auto EC = VecTy->getElementCount();
17325 unsigned MinElts = EC.getKnownMinValue();
17326
17327 UseScalable = false;
17328
17329 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17330 (!Subtarget->useSVEForFixedLengthVectors() ||
17331        !getSVEPredPatternFromNumElements(MinElts)))
17332     return false;
17333
17334 if (isa<ScalableVectorType>(VecTy) &&
17335 !Subtarget->isSVEorStreamingSVEAvailable())
17336 return false;
17337
17338 // Ensure the number of vector elements is greater than 1.
17339 if (MinElts < 2)
17340 return false;
17341
17342 // Ensure the element type is legal.
17343 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17344 return false;
17345
17346 if (EC.isScalable()) {
17347 UseScalable = true;
17348 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17349 }
17350
17351 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17352 if (Subtarget->useSVEForFixedLengthVectors()) {
17353 unsigned MinSVEVectorSize =
17354 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17355 if (VecSize % MinSVEVectorSize == 0 ||
17356 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17357 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17358 UseScalable = true;
17359 return true;
17360 }
17361 }
17362
17363 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17364 // 128 will be split into multiple interleaved accesses.
17365 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17366}
17367
17368 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17369   if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17370 return ScalableVectorType::get(VTy->getElementType(), 2);
17371
17372 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17373 return ScalableVectorType::get(VTy->getElementType(), 4);
17374
17375 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17376 return ScalableVectorType::get(VTy->getElementType(), 8);
17377
17378 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17379 return ScalableVectorType::get(VTy->getElementType(), 8);
17380
17381 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17382 return ScalableVectorType::get(VTy->getElementType(), 2);
17383
17384 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17385 return ScalableVectorType::get(VTy->getElementType(), 4);
17386
17387 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17388 return ScalableVectorType::get(VTy->getElementType(), 8);
17389
17390 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17391 return ScalableVectorType::get(VTy->getElementType(), 16);
17392
17393 llvm_unreachable("Cannot handle input vector type");
17394}
17395
17396static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17397 bool Scalable, Type *LDVTy,
17398 Type *PtrTy) {
17399 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17400 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17401 Intrinsic::aarch64_sve_ld3_sret,
17402 Intrinsic::aarch64_sve_ld4_sret};
17403 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17404 Intrinsic::aarch64_neon_ld3,
17405 Intrinsic::aarch64_neon_ld4};
17406 if (Scalable)
17407 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17408
17409 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17410 {LDVTy, PtrTy});
17411}
17412
17413static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17414 bool Scalable, Type *STVTy,
17415 Type *PtrTy) {
17416 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17417 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17418 Intrinsic::aarch64_sve_st3,
17419 Intrinsic::aarch64_sve_st4};
17420 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17421 Intrinsic::aarch64_neon_st3,
17422 Intrinsic::aarch64_neon_st4};
17423 if (Scalable)
17424 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17425
17426 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17427 {STVTy, PtrTy});
17428}
17429
17430/// Lower an interleaved load into a ldN intrinsic.
17431///
17432/// E.g. Lower an interleaved load (Factor = 2):
17433/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17434/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17435/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17436///
17437/// Into:
17438/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17439/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17440/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17441 bool AArch64TargetLowering::lowerInterleavedLoad(
17442     Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17443 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17444 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17445 "Invalid interleave factor");
17446 assert(!Shuffles.empty() && "Empty shufflevector input");
17447 assert(Shuffles.size() == Indices.size() &&
17448 "Unmatched number of shufflevectors and indices");
17449
17450 auto *LI = dyn_cast<LoadInst>(Load);
17451 if (!LI)
17452 return false;
17453 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17454
17455 const DataLayout &DL = LI->getDataLayout();
17456
17457 VectorType *VTy = Shuffles[0]->getType();
17458
17459 // Skip if we do not have NEON and skip illegal vector types. We can
17460 // "legalize" wide vector types into multiple interleaved accesses as long as
17461 // the vector types are divisible by 128.
17462 bool UseScalable;
17463 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17464 return false;
17465
17466 // Check if the interleave is a zext(shuffle), that can be better optimized
17467 // into shift / and masks. For the moment we do this just for uitofp (not
17468 // zext) to avoid issues with widening instructions.
17469 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17470 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17471 SI->getType()->getScalarSizeInBits() * 4 ==
17472 SI->user_back()->getType()->getScalarSizeInBits();
17473 }))
17474 return false;
17475
17476 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17477
17478 auto *FVTy = cast<FixedVectorType>(VTy);
17479
17480 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17481 // load integer vectors first and then convert to pointer vectors.
17482 Type *EltTy = FVTy->getElementType();
17483 if (EltTy->isPointerTy())
17484 FVTy =
17485 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17486
17487 // If we're going to generate more than one load, reset the sub-vector type
17488 // to something legal.
17489 FVTy = FixedVectorType::get(FVTy->getElementType(),
17490 FVTy->getNumElements() / NumLoads);
17491
17492 auto *LDVTy =
17493 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17494
17495 IRBuilder<> Builder(LI);
17496
17497 // The base address of the load.
17498 Value *BaseAddr = LI->getPointerOperand();
17499
17500 Type *PtrTy = LI->getPointerOperandType();
17501 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17502 LDVTy->getElementCount());
17503
17504 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17505 UseScalable, LDVTy, PtrTy);
17506
17507 // Holds sub-vectors extracted from the load intrinsic return values. The
17508 // sub-vectors are associated with the shufflevector instructions they will
17509 // replace.
17510   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17511
17512 Value *PTrue = nullptr;
17513 if (UseScalable) {
17514 std::optional<unsigned> PgPattern =
17515 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17516 if (Subtarget->getMinSVEVectorSizeInBits() ==
17517 Subtarget->getMaxSVEVectorSizeInBits() &&
17518 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17519 PgPattern = AArch64SVEPredPattern::all;
17520
17521 auto *PTruePat =
17522 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17523 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17524 {PTruePat});
17525 }
17526
17527 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17528
17529 // If we're generating more than one load, compute the base address of
17530 // subsequent loads as an offset from the previous.
17531 if (LoadCount > 0)
17532 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17533 FVTy->getNumElements() * Factor);
17534
17535 CallInst *LdN;
17536 if (UseScalable)
17537 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17538 else
17539 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17540
17541 // Extract and store the sub-vectors returned by the load intrinsic.
17542 for (unsigned i = 0; i < Shuffles.size(); i++) {
17543 ShuffleVectorInst *SVI = Shuffles[i];
17544 unsigned Index = Indices[i];
17545
17546 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17547
17548 if (UseScalable)
17549 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17550
17551 // Convert the integer vector to pointer vector if the element is pointer.
17552 if (EltTy->isPointerTy())
17553 SubVec = Builder.CreateIntToPtr(
17554             SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17555                                          FVTy->getNumElements()));
17556
17557 SubVecs[SVI].push_back(SubVec);
17558 }
17559 }
17560
17561 // Replace uses of the shufflevector instructions with the sub-vectors
17562 // returned by the load intrinsic. If a shufflevector instruction is
17563 // associated with more than one sub-vector, those sub-vectors will be
17564 // concatenated into a single wide vector.
17565 for (ShuffleVectorInst *SVI : Shuffles) {
17566 auto &SubVec = SubVecs[SVI];
17567 auto *WideVec =
17568 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17569 SVI->replaceAllUsesWith(WideVec);
17570 }
17571
17572 return true;
17573}
17574
17575template <typename Iter>
17576bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17577 int MaxLookupDist = 20;
17578 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17579 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17580 const Value *PtrA1 =
17581 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17582
17583 while (++It != End) {
17584 if (It->isDebugOrPseudoInst())
17585 continue;
17586 if (MaxLookupDist-- == 0)
17587 break;
17588 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17589 const Value *PtrB1 =
17590 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17591 DL, OffsetB);
17592 if (PtrA1 == PtrB1 &&
17593 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17594 .abs() == 16)
17595 return true;
17596 }
17597 }
17598
17599 return false;
17600}
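// Example of what this helper detects (illustrative):
//   store <2 x i32> %a, ptr %p
//   store <2 x i32> %b, ptr (getelementptr i8, ptr %p, i64 16)
// The two 8-byte stores are exactly 16 bytes apart, so the caller below
// prefers to keep the interleave as a shuffle plus paired-store sequence
// rather than forming a 64-bit st2.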
17601
17602/// Lower an interleaved store into a stN intrinsic.
17603///
17604/// E.g. Lower an interleaved store (Factor = 3):
17605/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17606/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17607/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17608///
17609/// Into:
17610/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17611/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17612/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17613/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17614///
17615/// Note that the new shufflevectors will be removed and we'll only generate one
17616/// st3 instruction in CodeGen.
17617///
17618/// Example for a more general valid mask (Factor 3). Lower:
17619/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17620/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17621/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17622///
17623/// Into:
17624/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17625/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17626/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17627/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17628 bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
17629                                                   Value *LaneMask,
17630 ShuffleVectorInst *SVI,
17631 unsigned Factor,
17632 const APInt &GapMask) const {
17633
17634 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17635 "Invalid interleave factor");
17636 auto *SI = dyn_cast<StoreInst>(Store);
17637 if (!SI)
17638 return false;
17639 assert(!LaneMask && GapMask.popcount() == Factor &&
17640 "Unexpected mask on store");
17641
17642 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17643 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17644
17645 unsigned LaneLen = VecTy->getNumElements() / Factor;
17646 Type *EltTy = VecTy->getElementType();
17647 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17648
17649 const DataLayout &DL = SI->getDataLayout();
17650 bool UseScalable;
17651
17652 // Skip if we do not have NEON and skip illegal vector types. We can
17653 // "legalize" wide vector types into multiple interleaved accesses as long as
17654 // the vector types are divisible by 128.
17655 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17656 return false;
17657
17658 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17659
17660 Value *Op0 = SVI->getOperand(0);
17661 Value *Op1 = SVI->getOperand(1);
17662 IRBuilder<> Builder(SI);
17663
17664 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17665 // vectors to integer vectors.
17666 if (EltTy->isPointerTy()) {
17667 Type *IntTy = DL.getIntPtrType(EltTy);
17668 unsigned NumOpElts =
17669 cast<FixedVectorType>(Op0->getType())->getNumElements();
17670
17671 // Convert to the corresponding integer vector.
17672 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17673 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17674 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17675
17676 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17677 }
17678
17679 // If we're going to generate more than one store, reset the lane length
17680 // and sub-vector type to something legal.
17681 LaneLen /= NumStores;
17682 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17683
17684 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17685 : SubVecTy;
17686
17687 // The base address of the store.
17688 Value *BaseAddr = SI->getPointerOperand();
17689
17690 auto Mask = SVI->getShuffleMask();
17691
17692   // Give up if none of the mask indices are in range.
17693   // If the mask is `poison`, `Mask` may be a vector of -1s.
17694   // If all of the indices are `poison`, an out-of-bounds read would happen later.
17695 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17696 return false;
17697 }
17698   // A 64bit st2 which does not start at element 0 will involve adding extra
17699   // ext elements, making the st2 unprofitable. If there is a nearby store
17700   // that points to BaseAddr+16 or BaseAddr-16, it can also be better left as a
17701   // zip;ldp pair which has higher throughput.
17702 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17703 (Mask[0] != 0 ||
17704 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17705 DL) ||
17706 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17707 BaseAddr, DL)))
17708 return false;
17709
17710 Type *PtrTy = SI->getPointerOperandType();
17711 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17712 STVTy->getElementCount());
17713
17714 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17715 UseScalable, STVTy, PtrTy);
17716
17717 Value *PTrue = nullptr;
17718 if (UseScalable) {
17719 std::optional<unsigned> PgPattern =
17720 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17721 if (Subtarget->getMinSVEVectorSizeInBits() ==
17722 Subtarget->getMaxSVEVectorSizeInBits() &&
17723 Subtarget->getMinSVEVectorSizeInBits() ==
17724 DL.getTypeSizeInBits(SubVecTy))
17725 PgPattern = AArch64SVEPredPattern::all;
17726
17727 auto *PTruePat =
17728 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17729 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17730 {PTruePat});
17731 }
17732
17733 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17734
17735     SmallVector<Value *, 5> Ops;
17736
17737 // Split the shufflevector operands into sub vectors for the new stN call.
17738 for (unsigned i = 0; i < Factor; i++) {
17739 Value *Shuffle;
17740 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17741 if (Mask[IdxI] >= 0) {
17742 Shuffle = Builder.CreateShuffleVector(
17743 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17744 } else {
17745 unsigned StartMask = 0;
17746 for (unsigned j = 1; j < LaneLen; j++) {
17747 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17748 if (Mask[IdxJ] >= 0) {
17749 StartMask = Mask[IdxJ] - j;
17750 break;
17751 }
17752 }
17753 // Note: Filling undef gaps with random elements is ok, since
17754 // those elements were being written anyway (with undefs).
17755 // In the case of all undefs we're defaulting to using elems from 0
17756 // Note: StartMask cannot be negative, it's checked in
17757 // isReInterleaveMask
17758 Shuffle = Builder.CreateShuffleVector(
17759 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17760 }
17761
17762 if (UseScalable)
17763 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
17764 Shuffle, uint64_t(0));
17765
17766 Ops.push_back(Shuffle);
17767 }
17768
17769 if (UseScalable)
17770 Ops.push_back(PTrue);
17771
17772     // If we're generating more than one store, compute the base address of
17773     // subsequent stores as an offset from the previous one.
17774 if (StoreCount > 0)
17775 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17776 BaseAddr, LaneLen * Factor);
17777
17778 Ops.push_back(BaseAddr);
17779 Builder.CreateCall(StNFunc, Ops);
17780 }
17781 return true;
17782}
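// Illustrative example of the transform above (schematic IR; the exact
// intrinsic mangling depends on the element type and factor): a factor-2
// interleaving shuffle feeding a store, e.g.
//   %iv = shufflevector <4 x i32> %a, <4 x i32> %b,
//                       <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
//   store <8 x i32> %iv, ptr %p
// is rewritten into a single structured store, roughly:
//   call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %a, <4 x i32> %b, ptr %p)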
17783
17784bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17785 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
17786 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
17787 if (Factor != 2 && Factor != 4) {
17788 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17789 return false;
17790 }
17791 auto *LI = dyn_cast<LoadInst>(Load);
17792 if (!LI)
17793 return false;
17794 assert(!Mask && "Unexpected mask on a load\n");
17795
17796 VectorType *VTy = getDeinterleavedVectorType(DI);
17797
17798 const DataLayout &DL = LI->getModule()->getDataLayout();
17799 bool UseScalable;
17800 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17801 return false;
17802
17803 // TODO: Add support for using SVE instructions with fixed types later, using
17804 // the code from lowerInterleavedLoad to obtain the correct container type.
17805 if (UseScalable && !VTy->isScalableTy())
17806 return false;
17807
17808 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17809 VectorType *LdTy =
17810 VectorType::get(VTy->getElementType(),
17811 VTy->getElementCount().divideCoefficientBy(NumLoads));
17812
17813 Type *PtrTy = LI->getPointerOperandType();
17814 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17815 UseScalable, LdTy, PtrTy);
17816
17817 IRBuilder<> Builder(LI);
17818 Value *Pred = nullptr;
17819 if (UseScalable)
17820 Pred =
17821 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17822
17823 Value *BaseAddr = LI->getPointerOperand();
17824 Value *Result = nullptr;
17825 if (NumLoads > 1) {
17826 // Create multiple legal small ldN.
17827 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17828 for (unsigned I = 0; I < NumLoads; ++I) {
17829 Value *Offset = Builder.getInt64(I * Factor);
17830
17831 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17832 Value *LdN = nullptr;
17833 if (UseScalable)
17834 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17835 else
17836 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17837 Value *Idx =
17838 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17839 for (unsigned J = 0; J < Factor; ++J) {
17840 ExtractedLdValues[J] = Builder.CreateInsertVector(
17841 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17842 }
17843 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17844 }
17845
17846 // Merge the values from different factors.
17847 Result = PoisonValue::get(DI->getType());
17848 for (unsigned J = 0; J < Factor; ++J)
17849 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
17850 } else {
17851 if (UseScalable)
17852 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17853 else
17854 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17855 }
17856
17857 // Replace the output of the deinterleave intrinsic with the output of the ld2/ld4.
17858 DI->replaceAllUsesWith(Result);
17859 return true;
17860}
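// Illustrative example (schematic IR, fixed-length NEON case): a plain load
// feeding a deinterleave intrinsic, e.g.
//   %wide = load <8 x i32>, ptr %p
//   %di = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide)
// is replaced by a structured load, roughly:
//   %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %p)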
17861
17862bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17863 Instruction *Store, Value *Mask,
17864 ArrayRef<Value *> InterleavedValues) const {
17865 unsigned Factor = InterleavedValues.size();
17866 if (Factor != 2 && Factor != 4) {
17867 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17868 return false;
17869 }
17870 StoreInst *SI = dyn_cast<StoreInst>(Store);
17871 if (!SI)
17872 return false;
17873 assert(!Mask && "Unexpected mask on plain store");
17874
17875 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17876 const DataLayout &DL = SI->getModule()->getDataLayout();
17877
17878 bool UseScalable;
17879 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17880 return false;
17881
17882 // TODO: Add support for using SVE instructions with fixed types later, using
17883 // the code from lowerInterleavedStore to obtain the correct container type.
17884 if (UseScalable && !VTy->isScalableTy())
17885 return false;
17886
17887 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17888
17889 VectorType *StTy =
17890 VectorType::get(VTy->getElementType(),
17891 VTy->getElementCount().divideCoefficientBy(NumStores));
17892
17893 Type *PtrTy = SI->getPointerOperandType();
17894 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17895 UseScalable, StTy, PtrTy);
17896
17897 IRBuilder<> Builder(SI);
17898
17899 Value *BaseAddr = SI->getPointerOperand();
17900 Value *Pred = nullptr;
17901
17902 if (UseScalable)
17903 Pred =
17904 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17905
17906 auto ExtractedValues = InterleavedValues;
17907 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
17908 if (UseScalable)
17909 StoreOperands.push_back(Pred);
17910 StoreOperands.push_back(BaseAddr);
17911 for (unsigned I = 0; I < NumStores; ++I) {
17912 Value *Address = BaseAddr;
17913 if (NumStores > 1) {
17914 Value *Offset = Builder.getInt64(I * Factor);
17915 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17916 Value *Idx =
17917 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17918 for (unsigned J = 0; J < Factor; J++) {
17919 StoreOperands[J] =
17920 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17921 }
17922 // Update the address operand for this iteration.
17923 StoreOperands[StoreOperands.size() - 1] = Address;
17924 }
17925 Builder.CreateCall(StNFunc, StoreOperands);
17926 }
17927 return true;
17928}
17929
17930EVT AArch64TargetLowering::getOptimalMemOpType(
17931 LLVMContext &Context, const MemOp &Op,
17932 const AttributeList &FuncAttributes) const {
17933 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17934 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17935 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17936 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
17937 // it would take one instruction to materialize the v2i64 zero and one store
17938 // (with a restrictive addressing mode), so just do i64 stores.
17939 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17940 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17941 if (Op.isAligned(AlignCheck))
17942 return true;
17943 unsigned Fast;
17944 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17945 MachineMemOperand::MONone, &Fast) &&
17946 Fast;
17947 };
17948
17949 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17950 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17951 return MVT::v16i8;
17952 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17953 return MVT::f128;
17954 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17955 return MVT::i64;
17956 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17957 return MVT::i32;
17958 return MVT::Other;
17959}
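// Worked examples of the policy above: a 16-byte-aligned memset of 64 bytes
// returns MVT::v16i8 when NEON is usable; an aligned 16-byte memset is below
// the 32-byte threshold and returns MVT::i64 instead; operations smaller than
// 4 bytes fall through to MVT::Other, leaving the generic lowering to choose.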
17960
17961LLT AArch64TargetLowering::getOptimalMemOpLLT(
17962 const MemOp &Op, const AttributeList &FuncAttributes) const {
17963 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17964 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17965 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17966 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
17967 // it would take one instruction to materialize the v2i64 zero and one store
17968 // (with a restrictive addressing mode), so just do i64 stores.
17969 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17970 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17971 if (Op.isAligned(AlignCheck))
17972 return true;
17973 unsigned Fast;
17974 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17975 MachineMemOperand::MONone, &Fast) &&
17976 Fast;
17977 };
17978
17979 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17980 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17981 return LLT::fixed_vector(2, 64);
17982 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17983 return LLT::scalar(128);
17984 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17985 return LLT::scalar(64);
17986 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17987 return LLT::scalar(32);
17988 return LLT();
17989}
17990
17991// 12-bit optionally shifted immediates are legal for adds.
17992bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17993 if (Immed == std::numeric_limits<int64_t>::min()) {
17994 return false;
17995 }
17996 // Same encoding for add/sub, just flip the sign.
17997 return isLegalArithImmed((uint64_t)std::abs(Immed));
17998}
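// For example: 0xfff (4095) and 0xfff000 (4095 << 12) are legal add
// immediates, 0x1001 is not (it does not fit a 12-bit field even with the
// optional shift), and -4095 is accepted because sub uses the same encoding
// with the sign flipped.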
17999
18000bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18001 // We will only emit addvl/inc* instructions for SVE2.
18002 if (!Subtarget->hasSVE2())
18003 return false;
18004
18005 // addvl's immediate counts whole vector registers, i.e. multiples of the
18006 // 16 bytes in the base supported size (128 bits), so we divide the
18007 // immediate by 16 to get a useful value to multiply by vscale. The
18008 // division must not leave a remainder.
18009 if (Imm % 16 == 0)
18010 return isInt<6>(Imm / 16);
18011
18012 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18013 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18014 // of addvl as a result, so only take h|w|d into account.
18015 // Dec[h|w|d] will cover subtractions.
18016 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18017 // FIXME: Can we make use of other patterns to cover other immediates?
18018
18019 // inch|dech
18020 if (Imm % 8 == 0)
18021 return std::abs(Imm / 8) <= 16;
18022 // incw|decw
18023 if (Imm % 4 == 0)
18024 return std::abs(Imm / 4) <= 16;
18025 // incd|decd
18026 if (Imm % 2 == 0)
18027 return std::abs(Imm / 2) <= 16;
18028
18029 return false;
18030}
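// Worked examples of the rules above (assuming SVE2, with Imm being the
// multiple of vscale to add):
//   Imm == 32 -> addvl #2 (32 / 16 == 2, within the signed 6-bit range).
//   Imm == 40 -> not a multiple of 16, but 40 / 8 == 5, so inch with an
//                'all' pattern and multiplier 5 covers it.
//   Imm == 6  -> 6 / 2 == 3, covered by incd with multiplier 3.
//   Imm == 7  -> odd, so no addvl/inc*/dec* form matches and we return false.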
18031
18032// Return false to prevent folding
18033// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18034// if the folding leads to worse code.
18035bool AArch64TargetLowering::isMulAddWithConstProfitable(
18036 SDValue AddNode, SDValue ConstNode) const {
18037 // Let the DAGCombiner decide for vector types and large types.
18038 const EVT VT = AddNode.getValueType();
18039 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18040 return true;
18041
18042 // It is worse if c1 is a legal add immediate while c1*c2 is not, and
18043 // c1*c2 has to be composed of at least two instructions.
18044 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18045 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18046 const int64_t C1 = C1Node->getSExtValue();
18047 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18048 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18049 return true;
18050 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18051 // Adapt to the width of a register.
18052 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18053 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18054 if (Insn.size() > 1)
18055 return false;
18056
18057 // Default to true and let the DAGCombiner decide.
18058 return true;
18059}
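// For example (illustrative): for (mul (add x, 1), 0x12345678), c1 == 1 is a
// legal add immediate but c1*c2 == 0x12345678 needs a MOVZ plus a MOVK to
// materialize, so we return false and keep the add+mul form instead of
// (add (mul x, c2), c1*c2).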
18060
18061// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18062// immediates is the same as for an add or a sub.
18063bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18064 return isLegalAddImmediate(Immed);
18065}
18066
18067/// isLegalAddressingMode - Return true if the addressing mode represented
18068/// by AM is legal for this target, for a load/store of the specified type.
18069bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18070 const AddrMode &AMode, Type *Ty,
18071 unsigned AS, Instruction *I) const {
18072 // AArch64 has five basic addressing modes:
18073 // reg
18074 // reg + 9-bit signed offset
18075 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18076 // reg1 + reg2
18077 // reg + SIZE_IN_BYTES * reg
18078
18079 // No global is ever allowed as a base.
18080 if (AMode.BaseGV)
18081 return false;
18082
18083 // No reg+reg+imm addressing.
18084 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18085 return false;
18086
18087 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18088 // `2*ScaledReg` into `BaseReg + ScaledReg`
18089 AddrMode AM = AMode;
18090 if (AM.Scale && !AM.HasBaseReg) {
18091 if (AM.Scale == 1) {
18092 AM.HasBaseReg = true;
18093 AM.Scale = 0;
18094 } else if (AM.Scale == 2) {
18095 AM.HasBaseReg = true;
18096 AM.Scale = 1;
18097 } else {
18098 return false;
18099 }
18100 }
18101
18102 // A base register is required in all addressing modes.
18103 if (!AM.HasBaseReg)
18104 return false;
18105
18106 if (Ty->isScalableTy()) {
18107 if (isa<ScalableVectorType>(Ty)) {
18108 // See if we have a foldable vscale-based offset, for vector types which
18109 // are either legal or smaller than the minimum; more work will be
18110 // required if we need to consider addressing for types which need
18111 // legalization by splitting.
18112 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18113 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18114 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18115 isPowerOf2_64(VecNumBytes))
18116 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18117
18118 uint64_t VecElemNumBytes =
18119 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18120 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18121 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18122 }
18123
18124 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18125 }
18126
18127 // No scalable offsets allowed for non-scalable types.
18128 if (AM.ScalableOffset)
18129 return false;
18130
18131 // check reg + imm case:
18132 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18133 uint64_t NumBytes = 0;
18134 if (Ty->isSized()) {
18135 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18136 NumBytes = NumBits / 8;
18137 if (!isPowerOf2_64(NumBits))
18138 NumBytes = 0;
18139 }
18140
18141 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18142 AM.Scale);
18143}
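// For example, for an i32 access: [x0, #260] is accepted (260 == 4 * 65, an
// unsigned scaled 12-bit offset), [x0, x1, lsl #2] corresponds to Scale == 4
// and is also accepted, while a combined reg+reg+imm form is rejected by the
// early check above.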
18144
18145// Check whether the two offsets belong to the same imm24 range and share the same
18146// high 12 bits; if so, the high part can be handled by a single add of that offset.
18147int64_t
18148AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18149 int64_t MaxOffset) const {
18150 int64_t HighPart = MinOffset & ~0xfffULL;
18151 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18152 // Rebase the value to an integer multiple of imm12.
18153 return HighPart;
18154 }
18155
18156 return 0;
18157}
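// For example, offsets 0x12340 and 0x12f00 share the high part 0x12000, which
// is itself a legal add immediate (0x12 << 12), so 0x12000 is returned and the
// remaining low parts fit in the uimm12 addressing offset.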
18158
18159bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18160 // Consider splitting large offsets of struct or array accesses.
18161 return true;
18162}
18163
18165 const MachineFunction &MF, EVT VT) const {
18166 EVT ScalarVT = VT.getScalarType();
18167
18168 if (!ScalarVT.isSimple())
18169 return false;
18170
18171 switch (ScalarVT.getSimpleVT().SimpleTy) {
18172 case MVT::f16:
18173 return Subtarget->hasFullFP16();
18174 case MVT::f32:
18175 case MVT::f64:
18176 return true;
18177 case MVT::bf16:
18178 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18180 default:
18181 break;
18182 }
18183
18184 return false;
18185}
18186
18188 Type *Ty) const {
18189 switch (Ty->getScalarType()->getTypeID()) {
18190 case Type::FloatTyID:
18191 case Type::DoubleTyID:
18192 return true;
18193 default:
18194 return false;
18195 }
18196}
18197
18199 EVT VT, CodeGenOptLevel OptLevel) const {
18200 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18202}
18203
18204const MCPhysReg *
18206 // LR is a callee-save register, but we must treat it as clobbered by any call
18207 // site. Hence we include LR in the scratch registers, which are in turn added
18208 // as implicit-defs for stackmaps and patchpoints.
18209 static const MCPhysReg ScratchRegs[] = {
18210 AArch64::X16, AArch64::X17, AArch64::LR, 0
18211 };
18212 return ScratchRegs;
18213}
18214
18216 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18217 return RCRegs;
18218}
18219
18220bool
18221AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18222 CombineLevel Level) const {
18223 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18224 N->getOpcode() == ISD::SRL) &&
18225 "Expected shift op");
18226
18227 SDValue ShiftLHS = N->getOperand(0);
18228 EVT VT = N->getValueType(0);
18229
18230 if (!ShiftLHS->hasOneUse())
18231 return false;
18232
18233 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18234 !ShiftLHS.getOperand(0)->hasOneUse())
18235 return false;
18236
18237 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18238 // combine it with shift 'N' to let it be lowered to UBFX except:
18239 // ((x >> C) & mask) << C.
18240 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18241 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18242 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18243 if (isMask_64(TruncMask)) {
18244 SDValue AndLHS = ShiftLHS.getOperand(0);
18245 if (AndLHS.getOpcode() == ISD::SRL) {
18246 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18247 if (N->getOpcode() == ISD::SHL)
18248 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18249 return SRLC->getZExtValue() == SHLC->getZExtValue();
18250 return false;
18251 }
18252 }
18253 }
18254 }
18255 return true;
18256}
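// For example (assuming the intermediate values have single uses):
// ((x >> 2) & 0xff) << 2 returns true, since the shift amounts match and the
// combined form is fine, whereas ((x >> 2) & 0xff) << 3 returns false so the
// srl+and can still be selected as a UBFX.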
18257
18259 const SDNode *N) const {
18260 assert(N->getOpcode() == ISD::XOR &&
18261 (N->getOperand(0).getOpcode() == ISD::SHL ||
18262 N->getOperand(0).getOpcode() == ISD::SRL) &&
18263 "Expected XOR(SHIFT) pattern");
18264
18265 // Only commute if the entire NOT mask is a hidden shifted mask.
18266 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18267 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18268 if (XorC && ShiftC) {
18269 unsigned MaskIdx, MaskLen;
18270 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18271 unsigned ShiftAmt = ShiftC->getZExtValue();
18272 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18273 if (N->getOperand(0).getOpcode() == ISD::SHL)
18274 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18275 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18276 }
18277 }
18278
18279 return false;
18280}
18281
18283 const SDNode *N, CombineLevel Level) const {
18284 assert(((N->getOpcode() == ISD::SHL &&
18285 N->getOperand(0).getOpcode() == ISD::SRL) ||
18286 (N->getOpcode() == ISD::SRL &&
18287 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18288 "Expected shift-shift mask");
18289 // Don't allow multiuse shift folding with the same shift amount.
18290 if (!N->getOperand(0)->hasOneUse())
18291 return false;
18292
18293 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18294 EVT VT = N->getValueType(0);
18295 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18296 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18297 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18298 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18299 }
18300
18301 // We do not need to fold when this shift is used in the specific load case:
18302 // (ldr x, (add x, (shl (srl x, c1) 2)))
18303 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18304 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18305 unsigned ShlAmt = C2->getZExtValue();
18306 if (auto ShouldADD = *N->user_begin();
18307 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18308 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18309 EVT MemVT = Load->getMemoryVT();
18310
18311 if (Load->getValueType(0).isScalableVector())
18312 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18313
18314 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18315 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18316 }
18317 }
18318 }
18319 }
18320
18321 return true;
18322}
18323
18325 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18326 SDValue Y) const {
18327 return VT.isScalableVector() && isTypeLegal(VT) &&
18328 SelectOpcode == ISD::VSELECT;
18329}
18330
18332 Type *Ty) const {
18333 assert(Ty->isIntegerTy());
18334
18335 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18336 if (BitSize == 0)
18337 return false;
18338
18339 int64_t Val = Imm.getSExtValue();
18340 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18341 return true;
18342
18343 if (Val < 0)
18344 Val = ~Val;
18345 if (BitSize == 32)
18346 Val &= (1LL << 32) - 1;
18347
18348 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18349 // MOVZ is free so return true for one or fewer MOVK.
18350 return Shift < 3;
18351}
18352
18354 unsigned Index) const {
18356 return false;
18357
18358 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18359}
18360
18361/// Turn vector tests of the signbit in the form of:
18362/// xor (sra X, elt_size(X)-1), -1
18363/// into:
18364/// cmge X, X, #0
18365static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18366 const AArch64Subtarget *Subtarget) {
18367 EVT VT = N->getValueType(0);
18368 if (!Subtarget->hasNEON() || !VT.isVector())
18369 return SDValue();
18370
18371 // There must be a shift right algebraic before the xor, and the xor must be a
18372 // 'not' operation.
18373 SDValue Shift = N->getOperand(0);
18374 SDValue Ones = N->getOperand(1);
18375 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18376 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18377 return SDValue();
18378
18379 // The shift should be smearing the sign bit across each vector element.
18380 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18381 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18382 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18383 return SDValue();
18384
18385 SDLoc DL(N);
18386 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18387 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18388}
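// For example: xor (AArch64ISD::VASHR v8i16 X, #15), splat(-1) inverts the
// smeared sign bit, i.e. produces all-ones exactly where X >= 0, so it becomes
// setcc(X, 0, setge), which selects to cmge X, #0.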
18389
18390// Given a vecreduce_add node, detect the below pattern and convert it to the
18391// node sequence with UABDL, [S|U]ABD and UADDLP.
18392//
18393// i32 vecreduce_add(
18394// v16i32 abs(
18395// v16i32 sub(
18396// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18397//
18398// or
18399//
18400// i32 vecreduce_add(
18401// v16i32 zext(
18402// v16i16 abs(
18403// v16i16 sub(
18404// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18405//
18406// =================>
18407// i32 vecreduce_add(
18408// v4i32 UADDLP(
18409// v8i16 add(
18410// v8i16 zext(
18411// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18412// v8i16 zext(
18413// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18414static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18415 SelectionDAG &DAG) {
18416 // Assumed i32 vecreduce_add
18417 if (N->getValueType(0) != MVT::i32)
18418 return SDValue();
18419
18420 SDValue VecReduceOp0 = N->getOperand(0);
18421 bool SawTrailingZext = false;
18422 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18423 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18424 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18425 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18426 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18427 SawTrailingZext = true;
18428 VecReduceOp0 = VecReduceOp0.getOperand(0);
18429 }
18430
18431 // The ABS input is expected to be v16i16 if we looked through a trailing zext, v16i32 otherwise.
18432 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18433 // Assumed v16i16 or v16i32 abs input
18434 unsigned Opcode = VecReduceOp0.getOpcode();
18435 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18436 return SDValue();
18437
18438 SDValue ABS = VecReduceOp0;
18439 // Assumed v16i16 or v16i32 sub
18440 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18441 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18442 return SDValue();
18443
18444 SDValue SUB = ABS->getOperand(0);
18445 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18446 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18447 // Assumed v16i16 or v16i32 type
18448 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18449 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18450 return SDValue();
18451
18452 // Assumed zext or sext
18453 bool IsZExt = false;
18454 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18455 IsZExt = true;
18456 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18457 IsZExt = false;
18458 } else
18459 return SDValue();
18460
18461 SDValue EXT0 = SUB->getOperand(0);
18462 SDValue EXT1 = SUB->getOperand(1);
18463 // Assumed zext's operand has v16i8 type
18464 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18465 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18466 return SDValue();
18467
18468 // Pattern is detected. Let's convert it to sequence of nodes.
18469 SDLoc DL(N);
18470
18471 // First, create the node pattern of UABD/SABD.
18472 SDValue UABDHigh8Op0 =
18473 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18474 DAG.getConstant(8, DL, MVT::i64));
18475 SDValue UABDHigh8Op1 =
18476 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18477 DAG.getConstant(8, DL, MVT::i64));
18478 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18479 UABDHigh8Op0, UABDHigh8Op1);
18480 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18481
18482 // Second, create the node pattern of UABAL.
18483 SDValue UABDLo8Op0 =
18484 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18485 DAG.getConstant(0, DL, MVT::i64));
18486 SDValue UABDLo8Op1 =
18487 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18488 DAG.getConstant(0, DL, MVT::i64));
18489 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18490 UABDLo8Op0, UABDLo8Op1);
18491 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18492 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18493
18494 // Third, create the node of UADDLP.
18495 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18496
18497 // Fourth, create the node of VECREDUCE_ADD.
18498 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18499}
18500
18501static SDValue
18502performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18503 const AArch64Subtarget *ST) {
18504 if (DCI.isBeforeLegalize())
18505 return SDValue();
18506
18507 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18508 /*IsEqual=*/false))
18509 return While;
18510
18511 if (!N->getValueType(0).isScalableVector() ||
18512 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18513 return SDValue();
18514
18515 unsigned NumUses = N->use_size();
18516 auto MaskEC = N->getValueType(0).getVectorElementCount();
18517 if (!MaskEC.isKnownMultipleOf(NumUses))
18518 return SDValue();
18519
18520 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses);
18521 if (ExtMinEC.getKnownMinValue() < 2)
18522 return SDValue();
18523
18524 SmallVector<SDNode *> Extracts(NumUses, nullptr);
18525 for (SDNode *Use : N->users()) {
18526 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18527 return SDValue();
18528
18529 // Ensure the extract type is correct (e.g. if NumUses is 4 and
18530 // the mask return type is nxv8i1, each extract should be nxv2i1).
18531 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18532 return SDValue();
18533
18534 // There should be exactly one extract for each part of the mask.
18535 unsigned Offset = Use->getConstantOperandVal(1);
18536 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18537 if (Extracts[Part] != nullptr)
18538 return SDValue();
18539
18540 Extracts[Part] = Use;
18541 }
18542
18543 SelectionDAG &DAG = DCI.DAG;
18544 SDLoc DL(N);
18545 SDValue ID =
18546 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18547
18548 SDValue Idx = N->getOperand(0);
18549 SDValue TC = N->getOperand(1);
18550 EVT OpVT = Idx.getValueType();
18551 if (OpVT != MVT::i64) {
18552 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18553 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18554 }
18555
18556 // Create the whilelo_x2 intrinsics from each pair of extracts
18557 EVT ExtVT = Extracts[0]->getValueType(0);
18558 auto R =
18559 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18560 DCI.CombineTo(Extracts[0], R.getValue(0));
18561 DCI.CombineTo(Extracts[1], R.getValue(1));
18562
18563 if (NumUses == 2)
18564 return SDValue(N, 0);
18565
18566 auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2);
18567 for (unsigned I = 2; I < NumUses; I += 2) {
18568 // After the first whilelo_x2, we need to increment the starting value.
18569 Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts);
18570 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18571 DCI.CombineTo(Extracts[I], R.getValue(0));
18572 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18573 }
18574
18575 return SDValue(N, 0);
18576}
18577
18578// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and a vecreduce:
18579// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18580// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18581// If we have vectors larger than v16i8 we extract v16i8 subvectors, follow
18582// the same steps as above to get DOT instructions, concatenate them,
18583// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18584static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18585 const AArch64Subtarget *ST) {
18586 if (!ST->isNeonAvailable())
18587 return SDValue();
18588
18589 if (!ST->hasDotProd())
18590 return performVecReduceAddCombineWithUADDLP(N, DAG);
18591
18592 SDValue Op0 = N->getOperand(0);
18593 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18594 Op0.getValueType().getVectorElementType() != MVT::i32)
18595 return SDValue();
18596
18597 unsigned ExtOpcode = Op0.getOpcode();
18598 SDValue A = Op0;
18599 SDValue B;
18600 unsigned DotOpcode;
18601 if (ExtOpcode == ISD::MUL) {
18602 A = Op0.getOperand(0);
18603 B = Op0.getOperand(1);
18604 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18605 return SDValue();
18606 auto OpCodeA = A.getOpcode();
18607 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18608 return SDValue();
18609
18610 auto OpCodeB = B.getOpcode();
18611 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18612 return SDValue();
18613
18614 if (OpCodeA == OpCodeB) {
18615 DotOpcode =
18616 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18617 } else {
18618 // Check for USDOT support.
18619 if (!ST->hasMatMulInt8())
18620 return SDValue();
18621 DotOpcode = AArch64ISD::USDOT;
18622 if (OpCodeA == ISD::SIGN_EXTEND)
18623 std::swap(A, B);
18624 }
18625 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18626 DotOpcode = AArch64ISD::UDOT;
18627 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18628 DotOpcode = AArch64ISD::SDOT;
18629 } else {
18630 return SDValue();
18631 }
18632
18633 EVT Op0VT = A.getOperand(0).getValueType();
18634 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18635 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18636 if (!IsValidElementCount || !IsValidSize)
18637 return SDValue();
18638
18639 SDLoc DL(Op0);
18640 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18641 // the extend B.
18642 if (!B)
18643 B = DAG.getConstant(1, DL, Op0VT);
18644 else
18645 B = B.getOperand(0);
18646
18647 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18648 unsigned NumOfVecReduce;
18649 EVT TargetType;
18650 if (IsMultipleOf16) {
18651 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18652 TargetType = MVT::v4i32;
18653 } else {
18654 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18655 TargetType = MVT::v2i32;
18656 }
18657 // Handle the case where we need to generate only one Dot operation.
18658 if (NumOfVecReduce == 1) {
18659 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18660 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18661 A.getOperand(0), B);
18662 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18663 }
18664 // Generate Dot instructions for the part that is a multiple of 16 elements.
18665 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18666 SmallVector<SDValue, 4> SDotVec16;
18667 unsigned I = 0;
18668 for (; I < VecReduce16Num; I += 1) {
18669 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18670 SDValue Op0 =
18671 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18672 DAG.getConstant(I * 16, DL, MVT::i64));
18673 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18674 DAG.getConstant(I * 16, DL, MVT::i64));
18675 SDValue Dot =
18676 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18677 SDotVec16.push_back(Dot);
18678 }
18679 // Concatenate dot operations.
18680 EVT SDot16EVT =
18681 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18682 SDValue ConcatSDot16 =
18683 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18684 SDValue VecReduceAdd16 =
18685 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18686 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18687 if (VecReduce8Num == 0)
18688 return VecReduceAdd16;
18689
18690 // Generate the remaining Dot operation for the 8-element tail.
18691 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18692 SDValue Vec8Op0 =
18693 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18694 DAG.getConstant(I * 16, DL, MVT::i64));
18695 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18696 DAG.getConstant(I * 16, DL, MVT::i64));
18697 SDValue Dot =
18698 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18699 SDValue VecReduceAdd8 =
18700 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18701 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18702 VecReduceAdd8);
18703}
18704
18705// Given an (integer) vecreduce, we know the order of the inputs does not
18706// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18707// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18708// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18709static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18710 auto DetectAddExtract = [&](SDValue A) {
18711 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18712 // UADDLP(x) if found.
18713 assert(A.getOpcode() == ISD::ADD);
18714 EVT VT = A.getValueType();
18715 SDValue Op0 = A.getOperand(0);
18716 SDValue Op1 = A.getOperand(1);
18717 if (Op0.getOpcode() != Op1.getOpcode() ||
18718 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18719 Op0.getOpcode() != ISD::SIGN_EXTEND))
18720 return SDValue();
18721 SDValue Ext0 = Op0.getOperand(0);
18722 SDValue Ext1 = Op1.getOperand(0);
18723 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18724 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18725 Ext0.getOperand(0) != Ext1.getOperand(0))
18726 return SDValue();
18727 // Check that the extract source type has twice as many elements as the add
18728 // type, and that the extracts are from the upper/lower halves of the same source.
18729 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18730 VT.getVectorNumElements() * 2)
18731 return SDValue();
18732 if ((Ext0.getConstantOperandVal(1) != 0 ||
18733 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18734 (Ext1.getConstantOperandVal(1) != 0 ||
18735 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18736 return SDValue();
18737 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18738 : AArch64ISD::SADDLP;
18739 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18740 };
18741
18742 if (SDValue R = DetectAddExtract(A))
18743 return R;
18744
18745 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18746 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18747 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18748 A.getOperand(1));
18749 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18750 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18751 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18752 A.getOperand(0));
18753 return SDValue();
18754}
18755
18756// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18757// UADDLV(concat), where the concat represents the 64-bit zext sources.
18758static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18759 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18760 // UADDLV(concat(zext, zext)) if found.
18761 assert(A.getOpcode() == ISD::ADD);
18762 EVT VT = A.getValueType();
18763 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18764 return SDValue();
18765 SDValue Op0 = A.getOperand(0);
18766 SDValue Op1 = A.getOperand(1);
18767 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18768 return SDValue();
18769 SDValue Ext0 = Op0.getOperand(0);
18770 SDValue Ext1 = Op1.getOperand(0);
18771 EVT ExtVT0 = Ext0.getValueType();
18772 EVT ExtVT1 = Ext1.getValueType();
18773 // Check zext VTs are the same and 64-bit length.
18774 if (ExtVT0 != ExtVT1 ||
18775 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18776 return SDValue();
18777 // Get VT for concat of zext sources.
18778 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18779 SDValue Concat =
18780 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18781
18782 switch (VT.getSimpleVT().SimpleTy) {
18783 case MVT::v2i64:
18784 case MVT::v4i32:
18785 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18786 case MVT::v8i16: {
18787 SDValue Uaddlv =
18788 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18789 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18790 }
18791 default:
18792 llvm_unreachable("Unhandled vector type");
18793 }
18794}
18795
18796static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18797 SDValue A = N->getOperand(0);
18798 if (A.getOpcode() == ISD::ADD) {
18799 if (SDValue R = performUADDVAddCombine(A, DAG))
18800 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18801 else if (SDValue R = performUADDVZextCombine(A, DAG))
18802 return R;
18803 }
18804 return SDValue();
18805}
18806
18807static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18808 TargetLowering::DAGCombinerInfo &DCI,
18809 const AArch64Subtarget *Subtarget) {
18810 if (DCI.isBeforeLegalizeOps())
18811 return SDValue();
18812
18813 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18814}
18815
18816SDValue
18817AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18818 SelectionDAG &DAG,
18819 SmallVectorImpl<SDNode *> &Created) const {
18820 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18821 if (isIntDivCheap(N->getValueType(0), Attr))
18822 return SDValue(N, 0); // Lower SDIV as SDIV
18823
18824 EVT VT = N->getValueType(0);
18825
18826 // If SVE is available, we can generate
18827 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
18828 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
18829 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
18830 return SDValue(N, 0);
18831
18832 // fold (sdiv X, pow2)
18833 if ((VT != MVT::i32 && VT != MVT::i64) ||
18834 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18835 return SDValue();
18836
18837 // If the divisor is 2 or -2, the default expansion is better. It will add
18838 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
18839 if (Divisor == 2 ||
18840 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18841 return SDValue();
18842
18843 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18844}
18845
18846SDValue
18847AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18848 SelectionDAG &DAG,
18849 SmallVectorImpl<SDNode *> &Created) const {
18850 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18851 if (isIntDivCheap(N->getValueType(0), Attr))
18852 return SDValue(N, 0); // Lower SREM as SREM
18853
18854 EVT VT = N->getValueType(0);
18855
18856 // For scalable and fixed types, mark them as cheap so we can handle it much
18857 // later. This allows us to handle larger than legal types.
18858 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18859 return SDValue(N, 0);
18860
18861 // fold (srem X, pow2)
18862 if ((VT != MVT::i32 && VT != MVT::i64) ||
18863 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18864 return SDValue();
18865
18866 unsigned Lg2 = Divisor.countr_zero();
18867 if (Lg2 == 0)
18868 return SDValue();
18869
18870 SDLoc DL(N);
18871 SDValue N0 = N->getOperand(0);
18872 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18873 SDValue Zero = DAG.getConstant(0, DL, VT);
18874 SDValue CCVal, CSNeg;
18875 if (Lg2 == 1) {
18876 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18877 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18878 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18879
18880 Created.push_back(Cmp.getNode());
18881 Created.push_back(And.getNode());
18882 } else {
18883 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
18884 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
18885
18886 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18887 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18888 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18889 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18890 Negs.getValue(1));
18891
18892 Created.push_back(Negs.getNode());
18893 Created.push_back(AndPos.getNode());
18894 Created.push_back(AndNeg.getNode());
18895 }
18896
18897 return CSNeg;
18898}
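// Illustrative expansion for X srem 8 (Lg2 == 3), roughly:
//   negs xtmp, x           // flags from 0 - x
//   and  xpos, x, #7
//   and  xneg, xtmp, #7
//   csneg x, xpos, xneg, mi
// i.e. select x & 7 when x is positive and -((-x) & 7) otherwise.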
18899
18900static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18901 switch(getIntrinsicID(S.getNode())) {
18902 default:
18903 break;
18904 case Intrinsic::aarch64_sve_cntb:
18905 return 8;
18906 case Intrinsic::aarch64_sve_cnth:
18907 return 16;
18908 case Intrinsic::aarch64_sve_cntw:
18909 return 32;
18910 case Intrinsic::aarch64_sve_cntd:
18911 return 64;
18912 }
18913 return {};
18914}
18915
18916/// Calculates what the pre-extend type is, based on the extension
18917/// operation node provided by \p Extend.
18918///
18919/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18920/// pre-extend type is pulled directly from the operand, while other extend
18921/// operations need a bit more inspection to get this information.
18922///
18923/// \param Extend The SDNode from the DAG that represents the extend operation
18924///
18925/// \returns The type representing the \p Extend source type, or \p MVT::Other
18926/// if no valid type can be determined
18927static EVT calculatePreExtendType(SDValue Extend) {
18928 switch (Extend.getOpcode()) {
18929 case ISD::SIGN_EXTEND:
18930 case ISD::ZERO_EXTEND:
18931 case ISD::ANY_EXTEND:
18932 return Extend.getOperand(0).getValueType();
18933 case ISD::AssertSext:
18934 case ISD::AssertZext:
18935 case ISD::SIGN_EXTEND_INREG: {
18936 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18937 if (!TypeNode)
18938 return MVT::Other;
18939 return TypeNode->getVT();
18940 }
18941 case ISD::AND: {
18942 ConstantSDNode *Constant =
18943 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18944 if (!Constant)
18945 return MVT::Other;
18946
18947 uint32_t Mask = Constant->getZExtValue();
18948
18949 if (Mask == UCHAR_MAX)
18950 return MVT::i8;
18951 else if (Mask == USHRT_MAX)
18952 return MVT::i16;
18953 else if (Mask == UINT_MAX)
18954 return MVT::i32;
18955
18956 return MVT::Other;
18957 }
18958 default:
18959 return MVT::Other;
18960 }
18961}
18962
18963/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18964/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18965/// SExt/ZExt rather than the scalar SExt/ZExt
18966static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18967 EVT VT = BV.getValueType();
18968 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18969 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18970 return SDValue();
18971
18972 // Use the first item in the buildvector/shuffle to get the size of the
18973 // extend, and make sure it looks valid.
18974 SDValue Extend = BV->getOperand(0);
18975 unsigned ExtendOpcode = Extend.getOpcode();
18976 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18977 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18978 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18979 ExtendOpcode == ISD::AssertSext;
18980 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18981 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18982 return SDValue();
18983 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
18984 // ensure calculatePreExtendType will work without issue.
18985 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18986 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18987 return SDValue();
18988
18989 // Restrict valid pre-extend data type
18990 EVT PreExtendType = calculatePreExtendType(Extend);
18991 if (PreExtendType == MVT::Other ||
18992 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18993 return SDValue();
18994
18995 // Make sure all other operands are equally extended.
18996 bool SeenZExtOrSExt = !IsAnyExt;
18997 for (SDValue Op : drop_begin(BV->ops())) {
18998 if (Op.isUndef())
18999 continue;
19000
19001 if (calculatePreExtendType(Op) != PreExtendType)
19002 return SDValue();
19003
19004 unsigned Opc = Op.getOpcode();
19005 if (Opc == ISD::ANY_EXTEND)
19006 continue;
19007
19008 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19009 Opc == ISD::AssertSext;
19010
19011 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19012 return SDValue();
19013
19014 IsSExt = OpcIsSExt;
19015 SeenZExtOrSExt = true;
19016 }
19017
19018 SDValue NBV;
19019 SDLoc DL(BV);
19020 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19021 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19022 EVT PreExtendLegalType =
19023 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19024 SmallVector<SDValue, 8> NewOps;
19025 for (SDValue Op : BV->ops())
19026 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19027 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19028 PreExtendLegalType));
19029 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19030 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19031 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19032 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19033 BV.getOperand(1).isUndef()
19034 ? DAG.getUNDEF(PreExtendVT)
19035 : BV.getOperand(1).getOperand(0),
19036 cast<ShuffleVectorSDNode>(BV)->getMask());
19037 }
19038 unsigned ExtOpc = !SeenZExtOrSExt
19039 ? ISD::ANY_EXTEND
19040 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19041 return DAG.getNode(ExtOpc, DL, VT, NBV);
19042}
19043
19044/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19045/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19046static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
19047 // If the value type isn't a vector, none of the operands are going to be dups.
19048 EVT VT = Mul->getValueType(0);
19049 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19050 return SDValue();
19051
19052 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19053 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19054
19055 // Neither operand has been changed; don't make any further changes.
19056 if (!Op0 && !Op1)
19057 return SDValue();
19058
19059 SDLoc DL(Mul);
19060 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19061 Op1 ? Op1 : Mul->getOperand(1));
19062}
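// For example: mul(v8i16 build_vector(sext i8 %a0 to i16, ...), %y) becomes
// mul(sext(v8i8 build_vector(%a0, ...) to v8i16), %y), which can help later
// selection of the widening smull/umull forms.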
19063
19064// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19065// Same for other types with equivalent constants.
19066static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
19067 EVT VT = N->getValueType(0);
19068 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19069 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19070 return SDValue();
19071 if (N->getOperand(0).getOpcode() != ISD::AND ||
19072 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19073 return SDValue();
19074
19075 SDValue And = N->getOperand(0);
19076 SDValue Srl = And.getOperand(0);
19077
19078 APInt V1, V2, V3;
19079 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19080 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19081 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19082 return SDValue();
19083
19084 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19085 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19086 V3 != (HalfSize - 1))
19087 return SDValue();
19088
19089 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19090 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19091 VT.getVectorElementCount() * 2);
19092
19093 SDLoc DL(N);
19094 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19095 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19096 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19097 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19098}
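// Worked example for the v4i32 case: in each 32-bit lane, (X >> 15) & 0x10001
// isolates the sign bits of the two 16-bit halves, and multiplying by 0xffff
// smears each of those bits across its half (0x1 * 0xffff == 0x0000ffff,
// 0x10000 * 0xffff == 0xffff0000). Viewed as v8i16 this is exactly a signed
// "less than zero" compare, hence the setcc(0 > X) above.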
19099
19100// Transform vector add(zext i8 to i32, zext i8 to i32)
19101// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19102// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19103// extends.
19104static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
19105 EVT VT = N->getValueType(0);
19106 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19107 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19108 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19109 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19110 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19111 N->getOperand(0).getOperand(0).getValueType() !=
19112 N->getOperand(1).getOperand(0).getValueType())
19113 return SDValue();
19114
19115 if (N->getOpcode() == ISD::MUL &&
19116 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19117 return SDValue();
19118
19119 SDValue N0 = N->getOperand(0).getOperand(0);
19120 SDValue N1 = N->getOperand(1).getOperand(0);
19121 EVT InVT = N0.getValueType();
19122
19123 EVT S1 = InVT.getScalarType();
19124 EVT S2 = VT.getScalarType();
19125 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19126 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19127 SDLoc DL(N);
19128 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19131 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19132 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19133 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19134 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19135 : (unsigned)ISD::SIGN_EXTEND,
19136 DL, VT, NewOp);
19137 }
19138 return SDValue();
19139}
19140
19141static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
19142 TargetLowering::DAGCombinerInfo &DCI,
19143 const AArch64Subtarget *Subtarget) {
19144
19145 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19146 return Ext;
19147 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
19148 return Ext;
19149 if (SDValue Ext = performVectorExtCombine(N, DAG))
19150 return Ext;
19151
19152 if (DCI.isBeforeLegalizeOps())
19153 return SDValue();
19154
19155 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19156 // and in MachineCombiner pass, add+mul will be combined into madd.
19157 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19158 SDLoc DL(N);
19159 EVT VT = N->getValueType(0);
19160 SDValue N0 = N->getOperand(0);
19161 SDValue N1 = N->getOperand(1);
19162 SDValue MulOper;
19163 unsigned AddSubOpc;
19164
19165 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19166 AddSubOpc = V->getOpcode();
19167 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19168 SDValue Opnd = V->getOperand(1);
19169 MulOper = V->getOperand(0);
19170 if (AddSubOpc == ISD::SUB)
19171 std::swap(Opnd, MulOper);
19172 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19173 return C->isOne();
19174 }
19175 return false;
19176 };
19177
19178 if (IsAddSubWith1(N0)) {
19179 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19180 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19181 }
19182
19183 if (IsAddSubWith1(N1)) {
19184 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19185 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19186 }
19187
19188 // The below optimizations require a constant RHS.
19189 if (!isa<ConstantSDNode>(N1))
19190 return SDValue();
19191
19192 ConstantSDNode *C = cast<ConstantSDNode>(N1);
19193 const APInt &ConstValue = C->getAPIntValue();
19194
19195 // Allow the scaling to be folded into the `cnt` instruction by preventing
19196 // the scaling from being obscured here. This makes it easier to pattern match.
19197 if (IsSVECntIntrinsic(N0) ||
19198 (N0->getOpcode() == ISD::TRUNCATE &&
19199 (IsSVECntIntrinsic(N0->getOperand(0)))))
19200 if (ConstValue.sge(1) && ConstValue.sle(16))
19201 return SDValue();
19202
19203 // Multiplication of a power of two plus/minus one can be done more
19204 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19205 // future CPUs have a cheaper MADD instruction, this may need to be
19206 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19207 // 64-bit is 5 cycles, so this is always a win.
19208 // More aggressively, some multiplications N0 * C can be lowered to
19209 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19210 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19211 // TODO: lower more cases.
19212
19213 // TrailingZeroes is used to test if the mul can be lowered to
19214 // shift+add+shift.
19215 unsigned TrailingZeroes = ConstValue.countr_zero();
19216 if (TrailingZeroes) {
19217 // Conservatively do not lower to shift+add+shift if the mul might be
19218 // folded into smul or umul.
19219 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19220 isZeroExtended(N0, DAG)))
19221 return SDValue();
19222 // Conservatively do not lower to shift+add+shift if the mul might be
19223 // folded into madd or msub.
19224 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19225 N->user_begin()->getOpcode() == ISD::SUB))
19226 return SDValue();
19227 }
19228 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19229 // and shift+add+shift.
19230 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19231 unsigned ShiftAmt;
19232
19233 auto Shl = [&](SDValue N0, unsigned N1) {
19234 if (!N0.getNode())
19235 return SDValue();
19236 // If shift causes overflow, ignore this combine.
19237 if (N1 >= N0.getValueSizeInBits())
19238 return SDValue();
19239 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19240 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19241 };
19242 auto Add = [&](SDValue N0, SDValue N1) {
19243 if (!N0.getNode() || !N1.getNode())
19244 return SDValue();
19245 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19246 };
19247 auto Sub = [&](SDValue N0, SDValue N1) {
19248 if (!N0.getNode() || !N1.getNode())
19249 return SDValue();
19250 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19251 };
19252 auto Negate = [&](SDValue N) {
19253 if (!N0.getNode())
19254 return SDValue();
19255 SDValue Zero = DAG.getConstant(0, DL, VT);
19256 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19257 };
19258
19259 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19260 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
19261 // the (2^N - 1) can't be executed via a single instruction.
19262 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19263 unsigned BitWidth = C.getBitWidth();
19264 for (unsigned i = 1; i < BitWidth / 2; i++) {
19265 APInt Rem;
19266 APInt X(BitWidth, (1 << i) + 1);
19267 APInt::sdivrem(C, X, N, Rem);
19268 APInt NVMinus1 = N - 1;
19269 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19270 M = X;
19271 return true;
19272 }
19273 }
19274 return false;
19275 };
19276
19277 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19278 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
19279 // the (2^N - 1) can't be executed via a single instruction.
19280 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19281 APInt CVMinus1 = C - 1;
19282 if (CVMinus1.isNegative())
19283 return false;
19284 unsigned TrailingZeroes = CVMinus1.countr_zero();
19285 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19286 if (SCVMinus1.isPowerOf2()) {
19287 unsigned BitWidth = SCVMinus1.getBitWidth();
19288 M = APInt(BitWidth, SCVMinus1.logBase2());
19289 N = APInt(BitWidth, TrailingZeroes);
19290 return true;
19291 }
19292 return false;
19293 };
19294
19295 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19296 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19297 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19298 APInt CVMinus1 = C - 1;
19299 if (CVMinus1.isNegative())
19300 return false;
19301 unsigned TrailingZeroes = CVMinus1.countr_zero();
19302 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19303 if (CVPlus1.isPowerOf2()) {
19304 unsigned BitWidth = CVPlus1.getBitWidth();
19305 M = APInt(BitWidth, CVPlus1.logBase2());
19306 N = APInt(BitWidth, TrailingZeroes);
19307 return true;
19308 }
19309 return false;
19310 };
19311
19312 if (ConstValue.isNonNegative()) {
19313 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19314 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19315 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19316 // (mul x, (2^M + 1) * (2^N + 1))
19317 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19318 // (mul x, (2^M + 1) * 2^N + 1)
19319 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
19320 // (mul x, 1 - (1 - 2^M) * 2^N)
19321 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
19322 APInt SCVMinus1 = ShiftedConstValue - 1;
19323 APInt SCVPlus1 = ShiftedConstValue + 1;
19324 APInt CVPlus1 = ConstValue + 1;
19325 APInt CVM, CVN;
19326 if (SCVMinus1.isPowerOf2()) {
19327 ShiftAmt = SCVMinus1.logBase2();
19328 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19329 } else if (CVPlus1.isPowerOf2()) {
19330 ShiftAmt = CVPlus1.logBase2();
19331 return Sub(Shl(N0, ShiftAmt), N0);
19332 } else if (SCVPlus1.isPowerOf2()) {
19333 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19334 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19335 }
19336 if (Subtarget->hasALULSLFast() &&
19337 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19338 APInt CVMMinus1 = CVM - 1;
19339 APInt CVNMinus1 = CVN - 1;
19340 unsigned ShiftM1 = CVMMinus1.logBase2();
19341 unsigned ShiftN1 = CVNMinus1.logBase2();
19342 // ALULSLFast implies that shifts of up to 4 places are fast
19343 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19344 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19345 return Add(Shl(MVal, ShiftN1), MVal);
19346 }
19347 }
19348 if (Subtarget->hasALULSLFast() &&
19349 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19350 unsigned ShiftM = CVM.getZExtValue();
19351 unsigned ShiftN = CVN.getZExtValue();
19352 // ALULSLFast implies that shifts of up to 4 places are fast
19353 if (ShiftM <= 4 && ShiftN <= 4) {
19354 SDValue MVal = Add(Shl(N0, ShiftM), N0);
19355 return Add(Shl(MVal, ShiftN), N0);
19356 }
19357 }
19358
19359 if (Subtarget->hasALULSLFast() &&
19360 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19361 unsigned ShiftM = CVM.getZExtValue();
19362 unsigned ShiftN = CVN.getZExtValue();
19363 // ALULSLFast implies that shifts of up to 4 places are fast
19364 if (ShiftM <= 4 && ShiftN <= 4) {
19365 SDValue MVal = Sub(N0, Shl(N0, ShiftM));
19366 return Sub(N0, Shl(MVal, ShiftN));
19367 }
19368 }
19369 } else {
19370 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19371 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19372 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19373 APInt SCVPlus1 = -ShiftedConstValue + 1;
19374 APInt CVNegPlus1 = -ConstValue + 1;
19375 APInt CVNegMinus1 = -ConstValue - 1;
19376 if (CVNegPlus1.isPowerOf2()) {
19377 ShiftAmt = CVNegPlus1.logBase2();
19378 return Sub(N0, Shl(N0, ShiftAmt));
19379 } else if (CVNegMinus1.isPowerOf2()) {
19380 ShiftAmt = CVNegMinus1.logBase2();
19381 return Negate(Add(Shl(N0, ShiftAmt), N0));
19382 } else if (SCVPlus1.isPowerOf2()) {
19383 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19384 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19385 }
19386 }
19387
19388 return SDValue();
19389}
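// Editor's note: the lines below are an illustrative, standalone sketch (not
// part of this file) that spells out the decompositions used above on the
// constants named in the comments; the mul45 helper is hypothetical.
constexpr unsigned mul45(unsigned x) {
  unsigned mv = (x << 2) + x;            // x * (2^2 + 1)
  return (mv << 3) + mv;                 // mv * (2^3 + 1)  ==  x * 45
}
static_assert(45 == (1 + (1 << 2)) * (1 + (1 << 3)), "45 = (2^2+1)*(2^3+1)");
static_assert(11 == ((1 << 2) + 1) * (1 << 1) + 1, "11 = (2^2+1)*2^1 + 1");
static_assert(29 == 1 - (1 - (1 << 3)) * (1 << 2), "29 = 1 - (1-2^3)*2^2");
static_assert(mul45(7) == 7 * 45, "two shift+add pairs replace the multiply");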
19390
19392 SelectionDAG &DAG) {
19393 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19394 // optimize away operation when it's from a constant.
19395 //
19396 // The general transformation is:
19397 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19398 // AND(VECTOR_CMP(x,y), constant2)
19399 // constant2 = UNARYOP(constant)
19400
19401 // Early exit if this isn't a vector operation, the operand of the
19402 // unary operation isn't a bitwise AND, or if the sizes of the operations
19403 // aren't the same.
19404 EVT VT = N->getValueType(0);
19405 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19406 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19407 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19408 return SDValue();
19409
19410 // Now check that the other operand of the AND is a constant. We could
19411 // make the transformation for non-constant splats as well, but it's unclear
19412 // that would be a benefit as it would not eliminate any operations, just
19413 // perform one more step in scalar code before moving to the vector unit.
19414 if (BuildVectorSDNode *BV =
19415 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19416 // Bail out if the vector isn't a constant.
19417 if (!BV->isConstant())
19418 return SDValue();
19419
19420 // Everything checks out. Build up the new and improved node.
19421 SDLoc DL(N);
19422 EVT IntVT = BV->getValueType(0);
19423 // Create a new constant of the appropriate type for the transformed
19424 // DAG.
19425 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19426 // The AND node needs bitcasts to/from an integer vector type around it.
19427 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19428 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19429 N->getOperand(0)->getOperand(0), MaskConst);
19430 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19431 return Res;
19432 }
19433
19434 return SDValue();
19435}
19436
19437/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19438 /// functions; this can help to reduce the number of fmovs to/from GPRs.
19439static SDValue
19442 const AArch64Subtarget *Subtarget) {
19443 if (N->isStrictFPOpcode())
19444 return SDValue();
19445
19446 if (DCI.isBeforeLegalizeOps())
19447 return SDValue();
19448
19449 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19450 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19451 return SDValue();
19452
19453 auto isSupportedType = [](EVT VT) {
19454 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19455 };
19456
19457 SDValue SrcVal = N->getOperand(0);
19458 EVT SrcTy = SrcVal.getValueType();
19459 EVT DestTy = N->getValueType(0);
19460
19461 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19462 return SDValue();
19463
19464 EVT SrcVecTy;
19465 EVT DestVecTy;
19466 if (DestTy.bitsGT(SrcTy)) {
19467 DestVecTy = getPackedSVEVectorVT(DestTy);
19468 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19469 } else {
19470 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19471 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19472 }
19473
19474 // Ensure the resulting src/dest vector type is legal.
19475 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19476 return SDValue();
19477
19478 SDLoc DL(N);
19479 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19480 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19481 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19482 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19483 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19484}
19485
19488 const AArch64Subtarget *Subtarget) {
19489 // First try to optimize away the conversion when it's conditionally from
19490 // a constant. Vectors only.
19492 return Res;
19493
19494 if (SDValue Res =
19495 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19496 return Res;
19497
19498 EVT VT = N->getValueType(0);
19499 if (VT != MVT::f32 && VT != MVT::f64)
19500 return SDValue();
19501
19502 // Only optimize when the source and destination types have the same width.
19503 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19504 return SDValue();
19505
19506 // If the result of an integer load is only used by an integer-to-float
19507 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19508 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19509 SDValue N0 = N->getOperand(0);
19510 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19511 N0.hasOneUse() &&
19512 // Do not change the width of a volatile load.
19513 !cast<LoadSDNode>(N0)->isVolatile()) {
19514 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19515 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19516 LN0->getPointerInfo(), LN0->getAlign(),
19517 LN0->getMemOperand()->getFlags());
19518
19519 // Make sure successors of the original load stay after it by updating them
19520 // to use the new Chain.
19521 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19522
19523 unsigned Opcode =
19524 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19525 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19526 }
19527
19528 return SDValue();
19529}
19530
19531/// Fold a floating-point multiply by power of two into floating-point to
19532/// fixed-point conversion.
19535 const AArch64Subtarget *Subtarget) {
19536 if (SDValue Res =
19537 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19538 return Res;
19539
19540 if (!Subtarget->isNeonAvailable())
19541 return SDValue();
19542
19543 if (!N->getValueType(0).isSimple())
19544 return SDValue();
19545
19546 SDValue Op = N->getOperand(0);
19547 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19548 return SDValue();
19549
19550 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19551 return SDValue();
19552
19553 SDValue ConstVec = Op->getOperand(1);
19554 if (!isa<BuildVectorSDNode>(ConstVec))
19555 return SDValue();
19556
19557 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19558 uint32_t FloatBits = FloatTy.getSizeInBits();
19559 if (FloatBits != 32 && FloatBits != 64 &&
19560 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19561 return SDValue();
19562
19563 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19564 uint32_t IntBits = IntTy.getSizeInBits();
19565 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19566 return SDValue();
19567
19568 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19569 if (IntBits > FloatBits)
19570 return SDValue();
19571
19572 BitVector UndefElements;
19573 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19574 int32_t Bits = IntBits == 64 ? 64 : 32;
19575 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19576 if (C == -1 || C == 0 || C > Bits)
19577 return SDValue();
19578
19579 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19580 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19581 return SDValue();
19582
19583 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19584 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19585 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19586 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19587 return SDValue();
19588 }
19589
19590 SDLoc DL(N);
19591 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19592 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19593 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19594 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19595 SDValue FixConv =
19597 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19598 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19599 // We can handle smaller integers by generating an extra trunc.
19600 if (IntBits < FloatBits)
19601 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19602
19603 return FixConv;
19604}
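// Editor's note: an illustrative standalone sketch (not part of this file) of
// the scalar identity behind the fold above: truncating x * 2^C to an integer
// matches a fixed-point convert with C fractional bits (the saturation and
// out-of-range behaviour of the real FCVTZS/FCVTZU forms are ignored here);
// toFixedPoint is a hypothetical helper.
#include <cassert>
#include <cmath>
#include <cstdint>

static int32_t toFixedPoint(float X, unsigned FracBits) {
  // Scale by 2^FracBits and truncate toward zero.
  return static_cast<int32_t>(std::trunc(X * static_cast<float>(1u << FracBits)));
}

int main() {
  float X = 3.1415926f;
  // Converting x * 16.0 to an integer agrees with a fixed-point convert of #4.
  assert(static_cast<int32_t>(X * 16.0f) == toFixedPoint(X, 4));
  return 0;
}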
19605
19606// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19607// convert to csel(ccmp(.., cc0)), depending on cc1:
19608
19609// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19610// =>
19611// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19612//
19613// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19614// =>
19615// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19617 EVT VT = N->getValueType(0);
19618 SDValue CSel0 = N->getOperand(0);
19619 SDValue CSel1 = N->getOperand(1);
19620
19621 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19622 CSel1.getOpcode() != AArch64ISD::CSEL)
19623 return SDValue();
19624
19625 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19626 return SDValue();
19627
19628 if (!isNullConstant(CSel0.getOperand(0)) ||
19629 !isOneConstant(CSel0.getOperand(1)) ||
19630 !isNullConstant(CSel1.getOperand(0)) ||
19631 !isOneConstant(CSel1.getOperand(1)))
19632 return SDValue();
19633
19634 SDValue Cmp0 = CSel0.getOperand(3);
19635 SDValue Cmp1 = CSel1.getOperand(3);
19638 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19639 return SDValue();
19640 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19641 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19642 std::swap(Cmp0, Cmp1);
19643 std::swap(CC0, CC1);
19644 }
19645
19646 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19647 return SDValue();
19648
19649 SDLoc DL(N);
19650 SDValue CCmp, Condition;
19651 unsigned NZCV;
19652
19653 if (N->getOpcode() == ISD::AND) {
19655 Condition = getCondCode(DAG, InvCC0);
19657 } else {
19659 Condition = getCondCode(DAG, CC0);
19661 }
19662
19663 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19664
19665 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19666 if (Op1 && Op1->getAPIntValue().isNegative() &&
19667 Op1->getAPIntValue().sgt(-32)) {
19668 // CCMP accepts constants in the range [0, 31].
19669 // If Op1 is a constant in the range [-31, -1], we can
19670 // select CCMN instead to avoid the extra mov.
19671 SDValue AbsOp1 =
19672 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19673 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
19674 AbsOp1, NZCVOp, Condition, Cmp0);
19675 } else {
19676 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
19677 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19678 }
19679 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19680 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
19681}
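// Editor's note: a hypothetical standalone model (not part of this file) of
// the AND case handled above: CCMP evaluates the second compare only when the
// first condition holds and otherwise installs an NZCV value that makes the
// second condition fail, which matches the semantics of &&.
#include <cassert>

static bool fusedAnd(int X0, int Y0, int X1, int Y1) {
  bool CC0 = (X0 == Y0);           // condition produced by the first CMP
  // CCMP under CC0; the fixed NZCV operand makes the LT test fail otherwise.
  return CC0 ? (X1 < Y1) : false;
}

int main() {
  int Vals[] = {-3, 0, 2};
  for (int A : Vals)
    for (int B : Vals)
      assert(fusedAnd(A, B, B, A) == ((A == B) && (B < A)));
  return 0;
}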
19682
19684 const AArch64Subtarget *Subtarget,
19685 const AArch64TargetLowering &TLI) {
19686 SelectionDAG &DAG = DCI.DAG;
19687
19688 if (SDValue R = performANDORCSELCombine(N, DAG))
19689 return R;
19690
19691 return SDValue();
19692}
19693
19695 if (!MemVT.getVectorElementType().isSimple())
19696 return false;
19697
19698 uint64_t MaskForTy = 0ull;
19699 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19700 case MVT::i8:
19701 MaskForTy = 0xffull;
19702 break;
19703 case MVT::i16:
19704 MaskForTy = 0xffffull;
19705 break;
19706 case MVT::i32:
19707 MaskForTy = 0xffffffffull;
19708 break;
19709 default:
19710 return false;
19711 break;
19712 }
19713
19714 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19715 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19716 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19717
19718 return false;
19719}
19720
19722 SDValue LeafOp = SDValue(N, 0);
19723 SDValue Op = N->getOperand(0);
19724 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19725 LeafOp.getValueType() != Op.getValueType())
19726 Op = Op->getOperand(0);
19727 if (LeafOp.getValueType() == Op.getValueType())
19728 return Op;
19729 return SDValue();
19730}
19731
19734 SelectionDAG &DAG = DCI.DAG;
19735 SDValue Src = N->getOperand(0);
19736 unsigned Opc = Src->getOpcode();
19737
19738 // Zero/any extend of an unsigned unpack
19739 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19740 SDValue UnpkOp = Src->getOperand(0);
19741 SDValue Dup = N->getOperand(1);
19742
19743 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19744 return SDValue();
19745
19746 SDLoc DL(N);
19747 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19748 if (!C)
19749 return SDValue();
19750
19751 uint64_t ExtVal = C->getZExtValue();
19752
19753 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19754 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19755 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19756 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19757 };
19758
19759 // If the mask is fully covered by the unpack, we don't need to push
19760 // a new AND onto the operand
19761 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19762 if (MaskAndTypeMatch(EltTy))
19763 return Src;
19764
19765 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19766 // to see if the mask is all-ones of size MemTy.
19767 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19768 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19769 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19770 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19771 if (MaskAndTypeMatch(EltTy))
19772 return Src;
19773 }
19774
19775 // Truncate to prevent a DUP with an over-wide constant.
19776 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19777
19778 // Otherwise, make sure we propagate the AND to the operand
19779 // of the unpack
19780 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19781 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19782
19783 SDValue And = DAG.getNode(ISD::AND, DL,
19784 UnpkOp->getValueType(0), UnpkOp, Dup);
19785
19786 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19787 }
19788
19789 if (DCI.isBeforeLegalizeOps())
19790 return SDValue();
19791
19792 // If one side of the AND is an all-active predicate then the AND is a
19793 // no-op and we can return the other operand directly.
19794 if (isAllActivePredicate(DAG, N->getOperand(0)))
19795 return N->getOperand(1);
19796 if (isAllActivePredicate(DAG, N->getOperand(1)))
19797 return N->getOperand(0);
19798
19800 return SDValue();
19801
19802 SDValue Mask = N->getOperand(1);
19803
19804 if (!Src.hasOneUse())
19805 return SDValue();
19806
19807 EVT MemVT;
19808
19809 // SVE load instructions perform an implicit zero-extend, which makes them
19810 // perfect candidates for combining.
19811 switch (Opc) {
19812 case AArch64ISD::LD1_MERGE_ZERO:
19813 case AArch64ISD::LDNF1_MERGE_ZERO:
19814 case AArch64ISD::LDFF1_MERGE_ZERO:
19815 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19816 break;
19817 case AArch64ISD::GLD1_MERGE_ZERO:
19818 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19819 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19820 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19821 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19822 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19823 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19824 case AArch64ISD::GLDFF1_MERGE_ZERO:
19825 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19826 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19827 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19828 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19829 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19830 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19831 case AArch64ISD::GLDNT1_MERGE_ZERO:
19832 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19833 break;
19834 default:
19835 return SDValue();
19836 }
19837
19838 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19839 return Src;
19840
19841 return SDValue();
19842}
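// Editor's note: a minimal scalar analogue (not part of this file) of why the
// AND is dropped above: a zero-extending load already clears the bits beyond
// the memory element width, so masking with that width's all-ones value is a
// no-op.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Mem = 0xAB;
  uint64_t Loaded = Mem;             // zero-extending i8 load, as LD1B does
  assert((Loaded & 0xFF) == Loaded); // the i8 mask changes nothing
  return 0;
}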
19843
19844// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19847
19848 // This function performs an optimization on a specific pattern involving
19849 // an AND operation and SETCC (Set Condition Code) node.
19850
19851 SDValue SetCC = N->getOperand(0);
19852 EVT VT = N->getValueType(0);
19853 SelectionDAG &DAG = DCI.DAG;
19854
19855 // If the current node (N) is used by any SELECT instruction, return an
19856 // empty SDValue and skip the optimization, as applying it there could
19857 // produce incorrect results.
19858 for (auto U : N->users())
19859 if (U->getOpcode() == ISD::SELECT)
19860 return SDValue();
19861
19862 // Check if the operand is a SETCC node with floating-point comparison
19863 if (SetCC.getOpcode() == ISD::SETCC &&
19864 SetCC.getOperand(0).getValueType() == MVT::f32) {
19865
19866 SDValue Cmp;
19868
19869 // Check if the DAG is after legalization and if we can emit the conjunction
19870 if (!DCI.isBeforeLegalize() &&
19871 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19872
19874
19875 SDLoc DL(N);
19876 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19877 DAG.getConstant(0, DL, VT),
19878 getCondCode(DAG, InvertedCC), Cmp);
19879 }
19880 }
19881 return SDValue();
19882}
19883
19886 SelectionDAG &DAG = DCI.DAG;
19887 SDValue LHS = N->getOperand(0);
19888 SDValue RHS = N->getOperand(1);
19889 EVT VT = N->getValueType(0);
19890
19891 if (SDValue R = performANDORCSELCombine(N, DAG))
19892 return R;
19893
19894 if (SDValue R = performANDSETCCCombine(N, DCI))
19895 return R;
19896
19897 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19898 return SDValue();
19899
19900 if (VT.isScalableVector())
19901 return performSVEAndCombine(N, DCI);
19902
19903 // The combining code below works only for NEON vectors. In particular, it
19904 // does not work for SVE when dealing with vectors wider than 128 bits.
19905 if (!VT.is64BitVector() && !VT.is128BitVector())
19906 return SDValue();
19907
19908 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19909 if (!BVN)
19910 return SDValue();
19911
19912 // AND does not accept an immediate, so check if we can use a BIC immediate
19913 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19914 // pattern in isel, because some immediates may be lowered to the preferred
19915 // (and x, (movi imm)) form, even though an mvni representation also exists.
19916 APInt DefBits(VT.getSizeInBits(), 0);
19917 APInt UndefBits(VT.getSizeInBits(), 0);
19918 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19919 SDValue NewOp;
19920
19921 // Any bits known to already be 0 need not be cleared again, which can help
19922 // reduce the size of the immediate to one supported by the instruction.
19923 KnownBits Known = DAG.computeKnownBits(LHS);
19924 APInt ZeroSplat(VT.getSizeInBits(), 0);
19925 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19926 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19927 << (Known.Zero.getBitWidth() * I);
19928
19929 DefBits = ~(DefBits | ZeroSplat);
19930 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19931 DefBits, &LHS)) ||
19932 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19933 DefBits, &LHS)))
19934 return NewOp;
19935
19936 UndefBits = ~(UndefBits | ZeroSplat);
19937 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19938 UndefBits, &LHS)) ||
19939 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19940 UndefBits, &LHS)))
19941 return NewOp;
19942 }
19943
19944 return SDValue();
19945}
19946
19949 SelectionDAG &DAG = DCI.DAG;
19950 SDValue LHS = N->getOperand(0);
19951 SDValue RHS = N->getOperand(1);
19952 EVT VT = N->getValueType(0);
19953 SDLoc DL(N);
19954
19955 if (!N->getFlags().hasAllowReassociation())
19956 return SDValue();
19957
19958 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19959 auto ReassocComplex = [&](SDValue A, SDValue B) {
19960 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19961 return SDValue();
19962 unsigned Opc = A.getConstantOperandVal(0);
19963 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19964 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19965 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19966 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19967 return SDValue();
19968 SDValue VCMLA = DAG.getNode(
19969 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19970 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19971 A.getOperand(2), A.getOperand(3));
19972 VCMLA->setFlags(A->getFlags());
19973 return VCMLA;
19974 };
19975 if (SDValue R = ReassocComplex(LHS, RHS))
19976 return R;
19977 if (SDValue R = ReassocComplex(RHS, LHS))
19978 return R;
19979
19980 return SDValue();
19981}
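// Editor's note: an illustrative standalone check (not part of this file) of
// why the combine above is gated on hasAllowReassociation(): floating-point
// addition is not associative, so fadd(a, vcmla(b, ...)) and
// vcmla(fadd(a, b), ...) may round differently without the reassoc flag.
#include <cassert>

int main() {
  float A = 1e20f, B = -1e20f, C = 1.0f;
  float Nested = A + (B + C);        // the 1.0f is absorbed: result is 0
  float Reassoc = (A + B) + C;       // the reassociated form: result is 1
  assert(Nested != Reassoc);
  return 0;
}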
19982
19983static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19984 switch (Opcode) {
19985 case ISD::STRICT_FADD:
19986 case ISD::FADD:
19987 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19988 case ISD::ADD:
19989 return VT == MVT::i64;
19990 default:
19991 return false;
19992 }
19993}
19994
19995static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19997
19999 if ((N.getOpcode() == ISD::SETCC) ||
20000 // get_active_lane_mask is lowered to a whilelo instruction.
20001 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20002 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20003 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20004 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20005 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20006 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20007 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20008 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20009 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20010 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt)))
20011 return true;
20012
20013 return false;
20014}
20015
20016// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20017// ... into: "ptrue p, all" + PTEST
20018static SDValue
20021 const AArch64Subtarget *Subtarget) {
20022 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20023 // Make sure PTEST can be legalised with illegal types.
20024 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20025 return SDValue();
20026
20027 SDValue N0 = N->getOperand(0);
20028 EVT VT = N0.getValueType();
20029
20030 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20031 !isNullConstant(N->getOperand(1)))
20032 return SDValue();
20033
20034 // Restrict the DAG combine to only cases where we're extracting from a
20035 // flag-setting operation.
20036 if (!isPredicateCCSettingOp(N0))
20037 return SDValue();
20038
20039 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20040 SelectionDAG &DAG = DCI.DAG;
20041 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20042 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20043}
20044
20045// Materialize : Idx = (add (mul vscale, NumEls), -1)
20046// i1 = extract_vector_elt t37, Constant:i64<Idx>
20047// ... into: "ptrue p, all" + PTEST
20048static SDValue
20051 const AArch64Subtarget *Subtarget) {
20052 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20053 // Make sure PTEST can be legalised with illegal types.
20054 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20055 return SDValue();
20056
20057 SDValue N0 = N->getOperand(0);
20058 EVT OpVT = N0.getValueType();
20059
20060 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20061 return SDValue();
20062
20063 // Idx == (add (mul vscale, NumEls), -1)
20064 SDValue Idx = N->getOperand(1);
20065 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20066 return SDValue();
20067
20068 SDValue VS = Idx.getOperand(0);
20069 if (VS.getOpcode() != ISD::VSCALE)
20070 return SDValue();
20071
20072 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20073 if (VS.getConstantOperandVal(0) != NumEls)
20074 return SDValue();
20075
20076 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20077 SelectionDAG &DAG = DCI.DAG;
20078 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20079 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20080}
20081
20082static SDValue
20084 const AArch64Subtarget *Subtarget) {
20085 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20086 SelectionDAG &DAG = DCI.DAG;
20087 SDValue Vec = N->getOperand(0);
20088 SDValue Idx = N->getOperand(1);
20089
20090 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20091 return SDValue();
20092
20093 // Only legal for 8, 16, 32, and 64 bit element types.
20094 EVT EltVT = Vec.getValueType().getVectorElementType();
20095 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20096 MVT::bf16, MVT::f32, MVT::f64}),
20097 EltVT.getSimpleVT().SimpleTy))
20098 return SDValue();
20099
20100 SDValue Mask = Idx.getOperand(0);
20101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20102 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20103 return SDValue();
20104
20105 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20106 Vec);
20107}
20108
20109static SDValue
20111 const AArch64Subtarget *Subtarget) {
20112 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20113 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20114 return Res;
20115 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20116 return Res;
20117 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20118 return Res;
20119
20120 SelectionDAG &DAG = DCI.DAG;
20121 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20122
20123 EVT VT = N->getValueType(0);
20124 const bool FullFP16 = Subtarget->hasFullFP16();
20125 bool IsStrict = N0->isStrictFPOpcode();
20126
20127 // extract(dup x) -> x
20128 if (N0.getOpcode() == AArch64ISD::DUP)
20129 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20130 : N0.getOperand(0);
20131
20132 // Rewrite for pairwise fadd pattern
20133 // (f32 (extract_vector_elt
20134 // (fadd (vXf32 Other)
20135 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20136 // ->
20137 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20138 // (extract_vector_elt (vXf32 Other) 1))
20139 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20140 // we can only do this when it's used only by the extract_vector_elt.
20141 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20142 (!IsStrict || N0.hasOneUse())) {
20143 SDLoc DL(N0);
20144 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20145 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20146
20147 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20148 SDValue Other = N00;
20149
20150 // And handle the commutative case.
20151 if (!Shuffle) {
20152 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20153 Other = N01;
20154 }
20155
20156 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20157 Other == Shuffle->getOperand(0)) {
20158 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20159 DAG.getConstant(0, DL, MVT::i64));
20160 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20161 DAG.getConstant(1, DL, MVT::i64));
20162 if (!IsStrict)
20163 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20164
20165 // For strict_fadd we need uses of the final extract_vector to be replaced
20166 // with the strict_fadd, but we also need uses of the chain output of the
20167 // original strict_fadd to use the chain output of the new strict_fadd as
20168 // otherwise it may not be deleted.
20169 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20170 {VT, MVT::Other},
20171 {N0->getOperand(0), Extract1, Extract2});
20172 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20173 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20174 return SDValue(N, 0);
20175 }
20176 }
20177
20178 return SDValue();
20179}
20180
20183 SelectionDAG &DAG) {
20184 SDLoc DL(N);
20185 EVT VT = N->getValueType(0);
20186 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20187 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20188
20189 if (VT.isScalableVector())
20190 return SDValue();
20191
20192 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20193 N1Opc == ISD::TRUNCATE) {
20194 SDValue N00 = N0->getOperand(0);
20195 SDValue N10 = N1->getOperand(0);
20196 EVT N00VT = N00.getValueType();
20197 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20198
20199 // Optimize concat_vectors of truncated vectors, where the intermediate
20200 // type is illegal, to avoid said illegality, e.g.,
20201 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20202 // (v2i16 (truncate (v2i64)))))
20203 // ->
20204 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20205 // (v4i32 (bitcast (v2i64))),
20206 // <0, 2, 4, 6>)))
20207 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20208 // on both input and result type, so we might generate worse code.
20209 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20210 if (N00VT == N10.getValueType() &&
20211 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20212 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20213 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20215 for (size_t i = 0; i < Mask.size(); ++i)
20216 Mask[i] = i * 2;
20217 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20218 DAG.getVectorShuffle(
20219 MidVT, DL,
20220 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20221 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20222 }
20223
20224 // Optimize two large shifts and a combine into a single combine and shift
20225 // For AArch64 architectures, sequences like the following:
20226 //
20227 // ushr v0.4s, v0.4s, #20
20228 // ushr v1.4s, v1.4s, #20
20229 // uzp1 v0.8h, v0.8h, v1.8h
20230 //
20231 // Can be optimized to:
20232 //
20233 // uzp2 v0.8h, v0.8h, v1.8h
20234 // ushr v0.8h, v0.8h, #4
20235 //
20236 // This optimization reduces instruction count.
20237 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20238 N00->getOperand(1) == N10->getOperand(1)) {
20239 SDValue N000 = N00->getOperand(0);
20240 SDValue N100 = N10->getOperand(0);
20241 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20242 N101ConstVal = N10->getConstantOperandVal(1),
20243 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20244
20245 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20246 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20247 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20248 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20249 SDValue NewShiftConstant =
20250 DAG.getConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20251
20252 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20253 }
20254 }
20255 }
20256
20257 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20258 N->getOperand(0).getValueType() == MVT::v2i16 ||
20259 N->getOperand(0).getValueType() == MVT::v2i8) {
20260 EVT SrcVT = N->getOperand(0).getValueType();
20261 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20262 // loads to prevent having to go through the v4i8 load legalization that
20263 // needs to extend each element into a larger type.
20264 if (N->getNumOperands() % 2 == 0 &&
20265 all_of(N->op_values(), [SrcVT](SDValue V) {
20266 if (V.getValueType() != SrcVT)
20267 return false;
20268 if (V.isUndef())
20269 return true;
20270 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20271 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20272 LD->getExtensionType() == ISD::NON_EXTLOAD;
20273 })) {
20274 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20275 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20277
20278 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20279 SDValue V = N->getOperand(i);
20280 if (V.isUndef())
20281 Ops.push_back(DAG.getUNDEF(FVT));
20282 else {
20283 LoadSDNode *LD = cast<LoadSDNode>(V);
20284 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20285 LD->getBasePtr(), LD->getMemOperand());
20286 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20287 Ops.push_back(NewLoad);
20288 }
20289 }
20290 return DAG.getBitcast(N->getValueType(0),
20291 DAG.getBuildVector(NVT, DL, Ops));
20292 }
20293 }
20294
20295 // Canonicalise concat_vectors to replace concatenations of truncated nots
20296 // with nots of concatenated truncates. This in some cases allows for multiple
20297 // redundant negations to be eliminated.
20298 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20299 // (v4i16 (truncate (not (v4i32)))))
20300 // ->
20301 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20302 // (v4i16 (truncate (v4i32)))))
20303 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20304 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20305 N->isOnlyUserOf(N1.getNode())) {
20306 auto isBitwiseVectorNegate = [](SDValue V) {
20307 return V->getOpcode() == ISD::XOR &&
20308 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20309 };
20310 SDValue N00 = N0->getOperand(0);
20311 SDValue N10 = N1->getOperand(0);
20312 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20313 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20314 return DAG.getNOT(
20315 DL,
20318 N00->getOperand(0)),
20320 N10->getOperand(0))),
20321 VT);
20322 }
20323 }
20324
20325 // Wait till after everything is legalized to try this. That way we have
20326 // legal vector types and such.
20327 if (DCI.isBeforeLegalizeOps())
20328 return SDValue();
20329
20330 // Optimise concat_vectors of two identical binops with a 128-bit destination
20331 // size, combining into a binop of two concats of the source vectors, e.g.:
20332 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20333 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20334 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20335 isVectorizedBinOp(N0Opc)) &&
20336 N0->hasOneUse() && N1->hasOneUse()) {
20337 SDValue N00 = N0->getOperand(0);
20338 SDValue N01 = N0->getOperand(1);
20339 SDValue N10 = N1->getOperand(0);
20340 SDValue N11 = N1->getOperand(1);
20341
20342 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20343 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20344 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20345 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20346 }
20347 }
20348
20349 auto IsRSHRN = [](SDValue Shr) {
20350 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20351 return false;
20352 SDValue Op = Shr.getOperand(0);
20353 EVT VT = Op.getValueType();
20354 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20355 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20356 return false;
20357
20358 APInt Imm;
20359 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20360 Imm = APInt(VT.getScalarSizeInBits(),
20361 Op.getOperand(1).getConstantOperandVal(0)
20362 << Op.getOperand(1).getConstantOperandVal(1));
20363 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20364 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20365 Imm = APInt(VT.getScalarSizeInBits(),
20366 Op.getOperand(1).getConstantOperandVal(0));
20367 else
20368 return false;
20369
20370 if (Imm != 1ULL << (ShtAmt - 1))
20371 return false;
20372 return true;
20373 };
20374
20375 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20376 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20377 ((IsRSHRN(N1) &&
20379 N1.isUndef())) {
20380 SDValue X = N0.getOperand(0).getOperand(0);
20381 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20382 : N1.getOperand(0).getOperand(0);
20383 EVT BVT =
20384 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20385 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20386 SDValue Add = DAG.getNode(
20387 ISD::ADD, DL, BVT, CC,
20388 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20389 SDValue Shr =
20390 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20391 return Shr;
20392 }
20393
20394 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20395 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20396 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20397 N0.getOperand(1) == N1.getOperand(1)) {
20398 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20399 DAG.getUNDEF(N0.getValueType()));
20400 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20401 DAG.getUNDEF(N0.getValueType()));
20402 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20403 }
20404
20405 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20406 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20407 // canonicalise to that.
20408 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20409 assert(VT.getScalarSizeInBits() == 64);
20410 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20411 DAG.getConstant(0, DL, MVT::i64));
20412 }
20413
20414 // Canonicalise concat_vectors so that the right-hand vector has as few
20415 // bit-casts as possible before its real operation. The primary matching
20416 // destination for these operations will be the narrowing "2" instructions,
20417 // which depend on the operation being performed on this right-hand vector.
20418 // For example,
20419 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20420 // becomes
20421 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20422
20423 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20424 return SDValue();
20425 SDValue RHS = N1->getOperand(0);
20426 MVT RHSTy = RHS.getValueType().getSimpleVT();
20427 // If the RHS is not a vector, this is not the pattern we're looking for.
20428 if (!RHSTy.isVector())
20429 return SDValue();
20430
20431 LLVM_DEBUG(
20432 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20433
20434 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20435 RHSTy.getVectorNumElements() * 2);
20436 return DAG.getNode(ISD::BITCAST, DL, VT,
20437 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20438 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20439 RHS));
20440}
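// Editor's note: a hypothetical scalar check (not part of this file) of the
// identity behind the ushr+ushr+uzp1 -> uzp2+ushr rewrite above: when the
// shift amount exceeds the narrow element width, taking the top half first
// (uzp2) and shifting by the remainder yields the same bits.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0x12345678u, 0xFFFFFFFFu, 0x0000ABCDu}) {
    uint16_t Narrowed = static_cast<uint16_t>(X >> 20);      // ushr #20, uzp1
    uint16_t TopHalf = static_cast<uint16_t>(X >> 16);       // uzp2
    assert(Narrowed == static_cast<uint16_t>(TopHalf >> 4)); // ushr #(20-16)
  }
  return 0;
}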
20441
20442static SDValue
20444 SelectionDAG &DAG) {
20445 if (DCI.isBeforeLegalizeOps())
20446 return SDValue();
20447
20448 EVT VT = N->getValueType(0);
20449 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20450 return SDValue();
20451
20452 SDValue V = N->getOperand(0);
20453
20454 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20455 // blocks this combine because the non-const case requires custom lowering.
20456 //
20457 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20458 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20459 if (isa<ConstantSDNode>(V.getOperand(0)))
20460 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20461
20462 return SDValue();
20463}
20464
20465static SDValue
20467 SelectionDAG &DAG) {
20468 SDLoc DL(N);
20469 SDValue Vec = N->getOperand(0);
20470 SDValue SubVec = N->getOperand(1);
20471 uint64_t IdxVal = N->getConstantOperandVal(2);
20472 EVT VecVT = Vec.getValueType();
20473 EVT SubVT = SubVec.getValueType();
20474
20475 // Promote fixed length vector zeros.
20476 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20477 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20478 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20479 : DAG.getConstantFP(0, DL, VecVT);
20480
20481 // Only do this for legal fixed vector types.
20482 if (!VecVT.isFixedLengthVector() ||
20483 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20484 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20485 return SDValue();
20486
20487 // Ignore widening patterns.
20488 if (IdxVal == 0 && Vec.isUndef())
20489 return SDValue();
20490
20491 // Subvector must be half the width and an "aligned" insertion.
20492 unsigned NumSubElts = SubVT.getVectorNumElements();
20493 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20494 (IdxVal != 0 && IdxVal != NumSubElts))
20495 return SDValue();
20496
20497 // Fold insert_subvector -> concat_vectors
20498 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20499 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20500 SDValue Lo, Hi;
20501 if (IdxVal == 0) {
20502 Lo = SubVec;
20503 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20504 DAG.getVectorIdxConstant(NumSubElts, DL));
20505 } else {
20506 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20507 DAG.getVectorIdxConstant(0, DL));
20508 Hi = SubVec;
20509 }
20510 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20511}
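// Editor's note: a hypothetical scalar-array sketch (not part of this file) of
// the fold above: inserting a half-width subvector at index 0 concatenates the
// new low half with the old high half, and at index NumSubElts it concatenates
// the old low half with the new high half.
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Vec{0, 1, 2, 3};
  std::array<int, 2> Sub{9, 8};
  std::array<int, 4> InsertLo{Sub[0], Sub[1], Vec[2], Vec[3]}; // concat(Sub, hi(Vec))
  std::array<int, 4> InsertHi{Vec[0], Vec[1], Sub[0], Sub[1]}; // concat(lo(Vec), Sub)
  assert((InsertLo == std::array<int, 4>{9, 8, 2, 3}));
  assert((InsertHi == std::array<int, 4>{0, 1, 9, 8}));
  return 0;
}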
20512
20515 SelectionDAG &DAG) {
20516 // Wait until after everything is legalized to try this. That way we have
20517 // legal vector types and such.
20518 if (DCI.isBeforeLegalizeOps())
20519 return SDValue();
20520 // Transform a scalar conversion of a value from a lane extract into a
20521 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20522 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20523 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20524 //
20525 // The second form interacts better with instruction selection and the
20526 // register allocator to avoid cross-class register copies that aren't
20527 // coalescable due to a lane reference.
20528
20529 // Check the operand and see if it originates from a lane extract.
20530 SDValue Op1 = N->getOperand(1);
20532 return SDValue();
20533
20534 // Yep, no additional predication needed. Perform the transform.
20535 SDValue IID = N->getOperand(0);
20536 SDValue Shift = N->getOperand(2);
20537 SDValue Vec = Op1.getOperand(0);
20538 SDValue Lane = Op1.getOperand(1);
20539 EVT ResTy = N->getValueType(0);
20540 EVT VecResTy;
20541 SDLoc DL(N);
20542
20543 // The vector width should be 128 bits by the time we get here, even
20544 // if it started as 64 bits (the extract_vector handling will have
20545 // done so). Bail if it is not.
20546 if (Vec.getValueSizeInBits() != 128)
20547 return SDValue();
20548
20549 if (Vec.getValueType() == MVT::v4i32)
20550 VecResTy = MVT::v4f32;
20551 else if (Vec.getValueType() == MVT::v2i64)
20552 VecResTy = MVT::v2f64;
20553 else
20554 return SDValue();
20555
20556 SDValue Convert =
20557 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20559}
20560
20561// AArch64 high-vector "long" operations are formed by performing the non-high
20562// version on an extract_subvector of each operand which gets the high half:
20563//
20564// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20565//
20566// However, there are cases which don't have an extract_high explicitly, but
20567// have another operation that can be made compatible with one for free. For
20568// example:
20569//
20570// (dupv64 scalar) --> (extract_high (dup128 scalar))
20571//
20572// This routine does the actual conversion of such DUPs, once outer routines
20573// have determined that everything else is in order.
20574// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20575// similarly here.
20577 MVT VT = N.getSimpleValueType();
20578 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20579 N.getConstantOperandVal(1) == 0)
20580 N = N.getOperand(0);
20581
20582 switch (N.getOpcode()) {
20583 case AArch64ISD::DUP:
20584 case AArch64ISD::DUPLANE8:
20585 case AArch64ISD::DUPLANE16:
20586 case AArch64ISD::DUPLANE32:
20587 case AArch64ISD::DUPLANE64:
20588 case AArch64ISD::MOVI:
20589 case AArch64ISD::MOVIshift:
20590 case AArch64ISD::MOVIedit:
20591 case AArch64ISD::MOVImsl:
20592 case AArch64ISD::MVNIshift:
20593 case AArch64ISD::MVNImsl:
20594 break;
20595 default:
20596 // FMOV could be supported, but isn't very useful, as it would only occur
20597 // if you passed a bitcast floating-point immediate to an eligible long
20598 // integer op (addl, smull, ...).
20599 return SDValue();
20600 }
20601
20602 if (!VT.is64BitVector())
20603 return SDValue();
20604
20605 SDLoc DL(N);
20606 unsigned NumElems = VT.getVectorNumElements();
20607 if (N.getValueType().is64BitVector()) {
20608 MVT ElementTy = VT.getVectorElementType();
20609 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20610 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20611 }
20612
20613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20614 DAG.getConstant(NumElems, DL, MVT::i64));
20615}
20616
20618 if (N.getOpcode() == ISD::BITCAST)
20619 N = N.getOperand(0);
20620 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20621 return false;
20622 if (N.getOperand(0).getValueType().isScalableVector())
20623 return false;
20624 return N.getConstantOperandAPInt(1) ==
20625 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20626}
20627
20628/// Helper structure to keep track of ISD::SET_CC operands.
20633};
20634
20635/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20637 const SDValue *Cmp;
20639};
20640
20641/// Helper structure to keep track of SetCC information.
20645};
20646
20650 /// Helper structure to be able to read SetCC information. If the IsAArch64
20651 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20652 /// GenericSetCCInfo.
20653};
20654
20655 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
20656 /// AArch64 lowered one.
20658/// \p SetCCInfo is filled accordingly.
20659 /// \post SetCCInfo is meaningful only when this function returns true.
20660/// \return True when Op is a kind of SET_CC operation.
20662 // If this is a setcc, this is straightforward.
20663 if (Op.getOpcode() == ISD::SETCC) {
20664 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20665 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20666 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20667 SetCCInfo.IsAArch64 = false;
20668 return true;
20669 }
20670 // Otherwise, check if this is a matching csel instruction.
20671 // In other words:
20672 // - csel 1, 0, cc
20673 // - csel 0, 1, !cc
20674 if (Op.getOpcode() != AArch64ISD::CSEL)
20675 return false;
20676 // Set the information about the operands.
20677 // TODO: we want the operands of the Cmp not the csel
20678 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20679 SetCCInfo.IsAArch64 = true;
20680 SetCCInfo.Info.AArch64.CC =
20681 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20682
20683 // Check that the operands match the constraints:
20684 // (1) Both operands must be constants.
20685 // (2) One must be 1 and the other must be 0.
20686 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20687 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20688
20689 // Check (1).
20690 if (!TValue || !FValue)
20691 return false;
20692
20693 // Check (2).
20694 if (!TValue->isOne()) {
20695 // Update the comparison when we are interested in !cc.
20696 std::swap(TValue, FValue);
20697 SetCCInfo.Info.AArch64.CC =
20699 }
20700 return TValue->isOne() && FValue->isZero();
20701}
20702
20703// Returns true if Op is setcc or zext of setcc.
20704static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20705 if (isSetCC(Op, Info))
20706 return true;
20707 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20708 isSetCC(Op->getOperand(0), Info));
20709}
20710
20711// The folding we want to perform is:
20712// (add x, [zext] (setcc cc ...) )
20713// -->
20714// (csel x, (add x, 1), !cc ...)
20715//
20716// The latter will get matched to a CSINC instruction.
20718 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20719 SDValue LHS = Op->getOperand(0);
20720 SDValue RHS = Op->getOperand(1);
20721 SetCCInfoAndKind InfoAndKind;
20722
20723 // If both operands are a SET_CC, then we don't want to perform this
20724 // folding and create another csel as this results in more instructions
20725 // (and higher register usage).
20726 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
20727 isSetCCOrZExtSetCC(RHS, InfoAndKind))
20728 return SDValue();
20729
20730 // If neither operand is a SET_CC, give up.
20731 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
20732 std::swap(LHS, RHS);
20733 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
20734 return SDValue();
20735 }
20736
20737 // FIXME: This could be generalized to work for FP comparisons.
20738 EVT CmpVT = InfoAndKind.IsAArch64
20739 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
20740 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20741 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20742 return SDValue();
20743
20744 SDValue CCVal;
20745 SDValue Cmp;
20746 SDLoc DL(Op);
20747 if (InfoAndKind.IsAArch64) {
20748 CCVal = DAG.getConstant(
20750 MVT::i32);
20751 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20752 } else
20753 Cmp = getAArch64Cmp(
20754 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
20755 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
20756 DL);
20757
20758 EVT VT = Op->getValueType(0);
20759 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
20760 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
20761}
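// Editor's note: an illustrative standalone check (not part of this file) of
// the fold above: adding a 0/1 setcc result to x is a conditional increment,
// which is the form CSINC matches.
#include <cassert>

int main() {
  int Vals[] = {-1, 0, 5};
  for (int A : Vals)
    for (int B : Vals) {
      int X = 42;
      int Plain = X + (A < B ? 1 : 0);
      int Folded = (A < B) ? (X + 1) : X; // csel (add x, 1), x, cc -> CSINC
      assert(Plain == Folded);
    }
  return 0;
}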
20762
20763// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20765 EVT VT = N->getValueType(0);
20766 // Only handle scalar integer result types.
20767 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20768 return SDValue();
20769
20770 SDValue LHS = N->getOperand(0);
20771 SDValue RHS = N->getOperand(1);
20772 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20773 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20774 return SDValue();
20775
20776 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
20777 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
20778 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20779 return SDValue();
20780
20781 SDValue Op1 = LHS->getOperand(0);
20782 SDValue Op2 = RHS->getOperand(0);
20783 EVT OpVT1 = Op1.getValueType();
20784 EVT OpVT2 = Op2.getValueType();
20785 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20786 Op2.getOpcode() != AArch64ISD::UADDV ||
20787 OpVT1.getVectorElementType() != VT)
20788 return SDValue();
20789
20790 SDValue Val1 = Op1.getOperand(0);
20791 SDValue Val2 = Op2.getOperand(0);
20792 EVT ValVT = Val1->getValueType(0);
20793 SDLoc DL(N);
20794 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
20795 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
20796 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
20797 DAG.getConstant(0, DL, MVT::i64));
20798}
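// Editor's note: an illustrative standalone check (not part of this file) of
// the reduction identity used above: the sum of two across-vector reductions
// equals one reduction of the element-wise sum, saving a UADDV.
#include <array>
#include <cassert>
#include <cstdint>
#include <numeric>

int main() {
  std::array<uint32_t, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  uint32_t TwoReductions = std::accumulate(A.begin(), A.end(), 0u) +
                           std::accumulate(B.begin(), B.end(), 0u);
  std::array<uint32_t, 4> Sum{};
  for (unsigned I = 0; I < Sum.size(); ++I)
    Sum[I] = A[I] + B[I];
  assert(TwoReductions == std::accumulate(Sum.begin(), Sum.end(), 0u));
  return 0;
}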
20799
20800/// Perform the scalar expression combine in the form of:
20801/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20802/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20804 EVT VT = N->getValueType(0);
20805 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20806 return SDValue();
20807
20808 SDValue LHS = N->getOperand(0);
20809 SDValue RHS = N->getOperand(1);
20810
20811 // Handle commutativity.
20812 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20813 LHS.getOpcode() != AArch64ISD::CSNEG) {
20814 std::swap(LHS, RHS);
20815 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20816 LHS.getOpcode() != AArch64ISD::CSNEG) {
20817 return SDValue();
20818 }
20819 }
20820
20821 if (!LHS.hasOneUse())
20822 return SDValue();
20823
20824 AArch64CC::CondCode AArch64CC =
20825 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
20826
20827 // The CSEL should have a constant one operand, and the CSNEG should have
20828 // a one or negative-one operand.
20829 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
20830 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
20831 if (!CTVal || !CFVal)
20832 return SDValue();
20833
20834 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20835 (CTVal->isOne() || CFVal->isOne())) &&
20836 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20837 (CTVal->isOne() || CFVal->isAllOnes())))
20838 return SDValue();
20839
20840 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20841 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20842 !CFVal->isOne()) {
20843 std::swap(CTVal, CFVal);
20844 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20845 }
20846
20847 SDLoc DL(N);
20848 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20849 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20850 !CFVal->isAllOnes()) {
20851 APInt C = -1 * CFVal->getAPIntValue();
20852 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
20853 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
20854 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20855 }
20856
20857 // It might be neutral for larger constants, as the immediate needs to be
20858 // materialized in a register.
20859 APInt ADDC = CTVal->getAPIntValue();
20860 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20861 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20862 return SDValue();
20863
20864 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20865 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20866 "Unexpected constant value");
20867
20868 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
20869 SDValue CCVal = getCondCode(DAG, AArch64CC);
20870 SDValue Cmp = LHS.getOperand(3);
20871
20872 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
20873}
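// Editor's note: a minimal standalone check (not part of this file) of the
// scalar identity behind the CSEL + add fold above: adding (cc ? c : 1) to b
// selects between b + c and b + 1, and the latter is what CSINC produces.
#include <cassert>

static int foldedForm(int B, int C, bool CC) {
  return CC ? (B + C) : (B + 1);   // CSINC(B + C, B, cc) increments B when !cc
}

int main() {
  for (bool CC : {false, true})
    assert((CC ? 7 : 1) + 5 == foldedForm(5, 7, CC));
  return 0;
}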
20874
20875// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
20877 EVT VT = N->getValueType(0);
20878 if (N->getOpcode() != ISD::ADD)
20879 return SDValue();
20880
20881 SDValue Dot = N->getOperand(0);
20882 SDValue A = N->getOperand(1);
20883 // Handle commutativity.
20884 auto isZeroDot = [](SDValue Dot) {
20885 return (Dot.getOpcode() == AArch64ISD::UDOT ||
20886 Dot.getOpcode() == AArch64ISD::SDOT) &&
20888 };
20889 if (!isZeroDot(Dot))
20890 std::swap(Dot, A);
20891 if (!isZeroDot(Dot))
20892 return SDValue();
20893
20894 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
20895 Dot.getOperand(2));
20896}
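// Editor's note: an illustrative standalone check (not part of this file) of
// the accumulator fold above: a dot product accumulated into zero and then
// added to A equals the same dot product accumulated directly into A.
#include <cassert>
#include <cstdint>

static uint32_t dotAcc(uint32_t Acc, const uint8_t X[4], const uint8_t Y[4]) {
  for (int I = 0; I < 4; ++I)        // one UDOT lane: acc += sum(x[i] * y[i])
    Acc += static_cast<uint32_t>(X[I]) * Y[I];
  return Acc;
}

int main() {
  uint8_t X[4] = {1, 2, 3, 4}, Y[4] = {5, 6, 7, 8};
  assert(1000u + dotAcc(0, X, Y) == dotAcc(1000u, X, Y));
  return 0;
}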
20897
20899 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
20900}
20901
20903 SDLoc DL(Op);
20904 EVT VT = Op.getValueType();
20905 SDValue Zero = DAG.getConstant(0, DL, VT);
20906 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
20907}
20908
20909// Try to fold
20910//
20911// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20912//
20913// The folding helps csel to be matched with csneg without generating
20914// redundant neg instruction, which includes negation of the csel expansion
20915// of abs node lowered by lowerABS.
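// Example (illustrative): for abs-style selects such as
//   neg(csel(x, neg(y), cc, flags))
// the fold gives csel(neg(x), y, cc, flags), which ISel can then match as a
// single CSNEG instead of a CSEL followed by a NEG (sketch; the exact match
// depends on the surrounding DAG).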
20917 if (!isNegatedInteger(SDValue(N, 0)))
20918 return SDValue();
20919
20920 SDValue CSel = N->getOperand(1);
20921 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20922 return SDValue();
20923
20924 SDValue N0 = CSel.getOperand(0);
20925 SDValue N1 = CSel.getOperand(1);
20926
20927 // If neither operand is a negation, the fold is not worthwhile, as it
20928 // would introduce two additional negations while removing only one.
20929 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
20930 return SDValue();
20931
20932 SDValue N0N = getNegatedInteger(N0, DAG);
20933 SDValue N1N = getNegatedInteger(N1, DAG);
20934
20935 SDLoc DL(N);
20936 EVT VT = CSel.getValueType();
20937 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
20938 CSel.getOperand(3));
20939}
20940
20941// The basic add/sub long vector instructions have variants with "2" on the end
20942// which act on the high-half of their inputs. They are normally matched by
20943// patterns like:
20944//
20945// (add (zeroext (extract_high LHS)),
20946// (zeroext (extract_high RHS)))
20947// -> uaddl2 vD, vN, vM
20948//
20949// However, if one of the extracts is something like a duplicate, this
20950// instruction can still be used profitably. This function puts the DAG into a
20951// more appropriate form for those patterns to trigger.
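// Example (illustrative): for
//   add(zext(extract_high(v16i8 X)), zext(dup(i8 s)))
// the DUP is rewritten as extract_high(dup128(s)) so that the whole node can
// match the uaddl2 pattern above instead of falling back to a plain add of
// two extends (sketch).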
20954 SelectionDAG &DAG = DCI.DAG;
20955 if (DCI.isBeforeLegalizeOps())
20956 return SDValue();
20957
20958 MVT VT = N->getSimpleValueType(0);
20959 if (!VT.is128BitVector()) {
20960 if (N->getOpcode() == ISD::ADD)
20961 return performSetccAddFolding(N, DAG);
20962 return SDValue();
20963 }
20964
20965 // Make sure both branches are extended in the same way.
20966 SDValue LHS = N->getOperand(0);
20967 SDValue RHS = N->getOperand(1);
20968 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20969 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20970 LHS.getOpcode() != RHS.getOpcode())
20971 return SDValue();
20972
20973 unsigned ExtType = LHS.getOpcode();
20974
20975 // It's only worth doing if at least one of the inputs is already an
20976 // extract, but we don't know which one it will be, so we have to try both.
20977 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
20978 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
20979 if (!RHS.getNode())
20980 return SDValue();
20981
20982 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
20983 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
20984 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
20985 if (!LHS.getNode())
20986 return SDValue();
20987
20988 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
20989 }
20990
20991 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
20992}
20993
20994static bool isCMP(SDValue Op) {
20995 return Op.getOpcode() == AArch64ISD::SUBS &&
20996 !Op.getNode()->hasAnyUseOfValue(0);
20997}
20998
20999// (CSEL 1 0 CC Cond) => CC
21000// (CSEL 0 1 CC Cond) => !CC
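// Example (illustrative): a 'cset w8, hs' that was lowered as
// CSEL(1, 0, HS, flags) is recognised here and reports HS, while the swapped
// form CSEL(0, 1, HS, flags) reports the inverted code LO.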
21001static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21002 if (Op.getOpcode() != AArch64ISD::CSEL)
21003 return std::nullopt;
21004 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21005 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21006 return std::nullopt;
21007 SDValue OpLHS = Op.getOperand(0);
21008 SDValue OpRHS = Op.getOperand(1);
21009 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21010 return CC;
21011 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21012 return getInvertedCondCode(CC);
21013
21014 return std::nullopt;
21015}
21016
21017// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21018// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
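// Example (illustrative): in a multi-word addition the carry produced by one
// ADCS may have been round-tripped through cset/cmp, e.g.
//   (ADCS l, r, (CMP (CSET HS, flags), 1))
// which this fold rewrites to (ADCS l, r, flags), reusing the original flags
// instead of re-deriving the carry.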
21019static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21020 SDValue CmpOp = Op->getOperand(2);
21021 if (!isCMP(CmpOp))
21022 return SDValue();
21023
21024 if (IsAdd) {
21025 if (!isOneConstant(CmpOp.getOperand(1)))
21026 return SDValue();
21027 } else {
21028 if (!isNullConstant(CmpOp.getOperand(0)))
21029 return SDValue();
21030 }
21031
21032 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21033 auto CC = getCSETCondCode(CsetOp);
21034 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21035 return SDValue();
21036
21037 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21038 Op->getOperand(0), Op->getOperand(1),
21039 CsetOp.getOperand(3));
21040}
21041
21042// (ADC x 0 cond) => (CINC x HS cond)
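// Example (illustrative): ADC(x, 0, flags) just adds the incoming carry bit
// to x, so it can be emitted as 'cinc x0, x0, hs', i.e. a CSINC of x with
// itself (sketch; see the CINC/CSINC equivalence noted below).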
21044 SDValue LHS = N->getOperand(0);
21045 SDValue RHS = N->getOperand(1);
21046 SDValue Cond = N->getOperand(2);
21047
21048 if (!isNullConstant(RHS))
21049 return SDValue();
21050
21051 EVT VT = N->getValueType(0);
21052 SDLoc DL(N);
21053
21054 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21056 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21057}
21058
21061 SelectionDAG &DAG) {
21062 SDLoc DL(N);
21063 EVT VT = N->getValueType(0);
21064
21066 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21067 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21068 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21069 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21070 Elt1->getOpcode() == ISD::FP_ROUND &&
21071 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21072 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21073 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21075 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21076 // Constant index.
21077 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21078 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21079 Elt0->getOperand(0)->getOperand(0) ==
21080 Elt1->getOperand(0)->getOperand(0) &&
21081 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21082 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21083 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21084 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21085 SDValue HighLanes;
21086 if (Elt2->getOpcode() == ISD::UNDEF &&
21087 Elt3->getOpcode() == ISD::UNDEF) {
21088 HighLanes = DAG.getUNDEF(MVT::v2f32);
21089 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21090 Elt3->getOpcode() == ISD::FP_ROUND &&
21091 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21092 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21093 Elt2->getConstantOperandVal(1) ==
21094 Elt3->getConstantOperandVal(1) &&
21095 Elt2->getOperand(0)->getOpcode() ==
21097 Elt3->getOperand(0)->getOpcode() ==
21099 // Constant index.
21100 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21101 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21102 Elt2->getOperand(0)->getOperand(0) ==
21103 Elt3->getOperand(0)->getOperand(0) &&
21104 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21105 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21106 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21107 HighLanes =
21108 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21109 }
21110 if (HighLanes) {
21111 SDValue DoubleToSingleSticky =
21112 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21113 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21114 DoubleToSingleSticky, HighLanes);
21115 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21116 Elt0->getOperand(1));
21117 }
21118 }
21119 }
21120 }
21121
21122 if (VT == MVT::v2f64) {
21123 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21124 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21125 Elt1->getOpcode() == ISD::FP_EXTEND &&
21127 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21128 Elt0->getOperand(0)->getOperand(0) ==
21129 Elt1->getOperand(0)->getOperand(0) &&
21130 // Constant index.
21131 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21132 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21133 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21134 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21135 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21136 // ResultType's known minimum vector length.
21137 Elt0->getOperand(0)->getConstantOperandVal(1) %
21139 0) {
21140 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21141 if (SrcVec.getValueType() == MVT::v4f16 ||
21142 SrcVec.getValueType() == MVT::v4bf16) {
21143 SDValue HalfToSingle =
21144 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21145 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21146 SDValue Extract = DAG.getNode(
21148 HalfToSingle, SubvectorIdx);
21149 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21150 }
21151 }
21152 }
21153
21154 // A build vector of two extracted elements is equivalent to an
21155 // extract subvector where the inner vector is any-extended to the
21156 // extract_vector_elt VT.
21157 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21158 // (extract_elt_iXX_to_i32 vec Idx+1))
21159 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21160
21161 // For now, only consider the v2i32 case, which arises as a result of
21162 // legalization.
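  // Example (illustrative): build_vector(extract_elt(v4i16 v, 2),
  //                                       extract_elt(v4i16 v, 3))
  // becomes extract_subvector(any_extend(v) : v4i32, 2), provided v4i32 is
  // legal, matching the pattern described above.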
21163 if (VT != MVT::v2i32)
21164 return SDValue();
21165
21166 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21167 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21168 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21169 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21170 // Constant index.
21171 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21172 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21173 // Both EXTRACT_VECTOR_ELT from same vector...
21174 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21175 // ... and contiguous. First element's index +1 == second element's index.
21176 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21177 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21178 // ResultType's known minimum vector length.
21179 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21180 SDValue VecToExtend = Elt0->getOperand(0);
21181 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21182 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21183 return SDValue();
21184
21185 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21186
21187 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21188 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21189 SubvectorIdx);
21190 }
21191
21192 return SDValue();
21193}
21194
21195 // A special combine for the sqdmulh family of instructions.
21196 // smin(sra(mul(sext v0, sext v1), SHIFT_AMOUNT), SATURATING_VAL)
21197 // can be reduced to sqdmulh(...).
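// Example (illustrative, i16 elements):
//   smin(sra(mul(sext x, sext y), splat(15)), splat(32767))
// computes the same values as sqdmulh(x, y), including the saturated
// INT16_MIN * INT16_MIN case, so it can be selected as a single SQDMULH
// (assuming the operand types pass the legality checks below).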
21199
21200 if (N->getOpcode() != ISD::SMIN)
21201 return SDValue();
21202
21203 EVT DestVT = N->getValueType(0);
21204
21205 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21206 DestVT.isScalableVector())
21207 return SDValue();
21208
21209 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21210
21211 if (!Clamp)
21212 return SDValue();
21213
21214 MVT ScalarType;
21215 unsigned ShiftAmt = 0;
21216 switch (Clamp->getSExtValue()) {
21217 case (1ULL << 15) - 1:
21218 ScalarType = MVT::i16;
21219 ShiftAmt = 16;
21220 break;
21221 case (1ULL << 31) - 1:
21222 ScalarType = MVT::i32;
21223 ShiftAmt = 32;
21224 break;
21225 default:
21226 return SDValue();
21227 }
21228
21229 SDValue Sra = N->getOperand(0);
21230 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21231 return SDValue();
21232
21233 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21234 if (!RightShiftVec)
21235 return SDValue();
21236 unsigned SExtValue = RightShiftVec->getSExtValue();
21237
21238 if (SExtValue != (ShiftAmt - 1))
21239 return SDValue();
21240
21241 SDValue Mul = Sra.getOperand(0);
21242 if (Mul.getOpcode() != ISD::MUL)
21243 return SDValue();
21244
21245 SDValue SExt0 = Mul.getOperand(0);
21246 SDValue SExt1 = Mul.getOperand(1);
21247
21248 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21249 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21250 return SDValue();
21251
21252 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21253 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21254
21255 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21256 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21257 SExt0Type.getVectorNumElements() == 1)
21258 return SDValue();
21259
21260 SDLoc DL(N);
21261 SDValue V0 = SExt0.getOperand(0);
21262 SDValue V1 = SExt1.getOperand(0);
21263
21264 // Ensure input vectors are extended to legal types
21265 if (SExt0Type.getFixedSizeInBits() < 64) {
21266 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21267 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21268 VecNumElements);
21269 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21270 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21271 }
21272
21273 SDValue SQDMULH =
21274 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21275
21276 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21277}
21278
21280 if (SDValue V = trySQDMULHCombine(N, DAG)) {
21281 return V;
21282 }
21283
21284 return SDValue();
21285}
21286
21289 SDLoc DL(N);
21290 EVT VT = N->getValueType(0);
21291 SDValue N0 = N->getOperand(0);
21292 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21293 N0.getOpcode() == AArch64ISD::DUP) {
21294 SDValue Op = N0.getOperand(0);
21295 if (VT.getScalarType() == MVT::i32 &&
21296 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21297 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21298 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21299 }
21300
21301 // Performing the following combine produces a preferable form for ISEL.
21302 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
21304 N0.hasOneUse()) {
21305 SDValue Op = N0.getOperand(0);
21306 SDValue ExtractIndexNode = N0.getOperand(1);
21307 if (!isa<ConstantSDNode>(ExtractIndexNode))
21308 return SDValue();
21309
21310 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21311 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21312 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21313 "Unexpected legalisation result!");
21314
21315 EVT SrcVectorType = Op.getValueType();
21316 // We also assume that SrcVectorType cannot be a V64 (see
21317 // LowerEXTRACT_VECTOR_ELT).
21318 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21319 "Unexpected legalisation result!");
21320
21321 unsigned ExtractIndex =
21322 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21323 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21324
21325 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21326 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21327 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21328 }
21329
21330 return SDValue();
21331}
21332
21333 // Check whether a node is an extend or shift operand.
21335 unsigned Opcode = N.getOpcode();
21336 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21337 EVT SrcVT;
21338 if (Opcode == ISD::SIGN_EXTEND_INREG)
21339 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21340 else
21341 SrcVT = N.getOperand(0).getValueType();
21342
21343 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21344 } else if (Opcode == ISD::AND) {
21345 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21346 if (!CSD)
21347 return false;
21348 uint64_t AndMask = CSD->getZExtValue();
21349 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21350 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21351 return isa<ConstantSDNode>(N.getOperand(1));
21352 }
21353
21354 return false;
21355}
21356
21357// (N - Y) + Z --> (Z - Y) + N
21358// when N is an extend or shift operand
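// Example (illustrative): ((zext(i8 a) - y) + z) is rewritten to
// ((z - y) + zext(i8 a)), so the extend can be folded into the final ADD as
// an extended-register operand (e.g. 'add x0, x1, w2, uxtb') instead of being
// computed separately (sketch).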
21360 SelectionDAG &DAG) {
21361 auto IsOneUseExtend = [](SDValue N) {
21362 return N.hasOneUse() && isExtendOrShiftOperand(N);
21363 };
21364
21365 // DAGCombiner will revert the combination when Z is a constant, causing an
21366 // infinite loop, so don't enable the combination when Z is a constant.
21367 // If Z is a one-use shift by a constant, we also can't do the optimization,
21368 // as it would likewise fall into an infinite loop.
21369 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21370 return SDValue();
21371
21372 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21373 return SDValue();
21374
21375 SDValue Shift = SUB.getOperand(0);
21376 if (!IsOneUseExtend(Shift))
21377 return SDValue();
21378
21379 SDLoc DL(N);
21380 EVT VT = N->getValueType(0);
21381
21382 SDValue Y = SUB.getOperand(1);
21383 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21384 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21385}
21386
21388 SelectionDAG &DAG) {
21389 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21390 // commutative.
21391 if (N->getOpcode() != ISD::ADD)
21392 return SDValue();
21393
21394 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21395 // shifted register is only available for i32 and i64.
21396 EVT VT = N->getValueType(0);
21397 if (VT != MVT::i32 && VT != MVT::i64)
21398 return SDValue();
21399
21400 SDLoc DL(N);
21401 SDValue LHS = N->getOperand(0);
21402 SDValue RHS = N->getOperand(1);
21403
21404 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21405 return Val;
21406 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21407 return Val;
21408
21409 uint64_t LHSImm = 0, RHSImm = 0;
21410 // If both operands are shifted by an immediate and the shift amount is not
21411 // greater than 4 for one operand, swap LHS and RHS to put the operand with
21412 // the smaller shift amount on the RHS.
21413 //
21414 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21415 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21416 // with LSL (shift > 4). For other processors, this is a no-op for both
21417 // performance and correctness.
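  // Example (illustrative): add(shl(a, 2), shl(b, 7)) is rewritten as
  // add(shl(b, 7), shl(a, 2)) so that the cheap 'lsl #2' ends up as the ADD's
  // shifted-register operand, leaving the more expensive 'lsl #7' as a
  // separate instruction on the cores listed above (sketch).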
21418 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21419 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21420 RHSImm > 4 && LHS.hasOneUse())
21421 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21422
21423 return SDValue();
21424}
21425
21426 // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21427 // This reassociates it back to allow the creation of more mls instructions.
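// Example (illustrative): sub(x, add(mul(a, b), mul(c, d))) is turned back
// into sub(sub(x, mul(a, b)), mul(c, d)), which can then select to two MLS
// (multiply-subtract) instructions instead of mul + mul + add + sub.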
21429 if (N->getOpcode() != ISD::SUB)
21430 return SDValue();
21431
21432 SDValue Add = N->getOperand(1);
21433 SDValue X = N->getOperand(0);
21434 if (Add.getOpcode() != ISD::ADD)
21435 return SDValue();
21436
21437 if (!Add.hasOneUse())
21438 return SDValue();
21440 return SDValue();
21441
21442 SDValue M1 = Add.getOperand(0);
21443 SDValue M2 = Add.getOperand(1);
21444 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21445 M1.getOpcode() != AArch64ISD::UMULL)
21446 return SDValue();
21447 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21448 M2.getOpcode() != AArch64ISD::UMULL)
21449 return SDValue();
21450
21451 EVT VT = N->getValueType(0);
21452 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21453 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21454}
21455
21456// Combine into mla/mls.
21457// This works on the patterns of:
21458// add v1, (mul v2, v3)
21459// sub v1, (mul v2, v3)
21460// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21461// It will transform the add/sub to a scalable version, so that we can
21462// make use of SVE's MLA/MLS that will be generated for that pattern
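// Example (illustrative): with SVE available, a fixed-length
//   add(v2i64 acc, extract_subvector(MUL_PRED(pg, a, b), 0))
// is performed on the scalable container type so that ISel can emit a single
// MLA, and the result is converted back to the fixed-length type (sketch).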
21463static SDValue
21465 SelectionDAG &DAG = DCI.DAG;
21466 // Make sure that the types are legal
21467 if (!DCI.isAfterLegalizeDAG())
21468 return SDValue();
21469 // Before using SVE's features, check first if it's available.
21470 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21471 return SDValue();
21472
21473 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21474 return SDValue();
21475
21476 if (!N->getValueType(0).isFixedLengthVector())
21477 return SDValue();
21478
21479 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21480 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21481 return SDValue();
21482
21483 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21484 return SDValue();
21485
21486 SDValue MulValue = Op1->getOperand(0);
21487 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21488 return SDValue();
21489
21490 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21491 return SDValue();
21492
21493 EVT ScalableVT = MulValue.getValueType();
21494 if (!ScalableVT.isScalableVector())
21495 return SDValue();
21496
21497 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21498 SDValue NewValue =
21499 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21500 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21501 };
21502
21503 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21504 return res;
21505 else if (N->getOpcode() == ISD::ADD)
21506 return performOpt(N->getOperand(1), N->getOperand(0));
21507
21508 return SDValue();
21509}
21510
21511 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21512// help, for example, to produce ssra from sshr+add.
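// Example (illustrative):
//   add(extract_elt(v1i64 sshr(X, 3), 0), i64 load p)
// becomes extract_elt(add(sshr(X, 3), scalar_to_vector(load p)), 0), letting
// the shift+add pair select to SSRA on the vector side (sketch).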
21514 EVT VT = N->getValueType(0);
21515 if (VT != MVT::i64 ||
21516 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21517 return SDValue();
21518 SDValue Op0 = N->getOperand(0);
21519 SDValue Op1 = N->getOperand(1);
21520
21521 // At least one of the operands should be an extract, and the other should be
21522 // something that is easy to convert to v1i64 type (in this case a load).
21523 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21524 Op0.getOpcode() != ISD::LOAD)
21525 return SDValue();
21526 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21527 Op1.getOpcode() != ISD::LOAD)
21528 return SDValue();
21529
21530 SDLoc DL(N);
21531 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21532 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21533 Op0 = Op0.getOperand(0);
21534 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21535 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21536 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21537 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21538 Op1 = Op1.getOperand(0);
21539 } else
21540 return SDValue();
21541
21542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21543 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21544 DAG.getConstant(0, DL, MVT::i64));
21545}
21546
21549 if (!BV->hasOneUse())
21550 return false;
21551 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21552 if (!Ld || !Ld->isSimple())
21553 return false;
21554 Loads.push_back(Ld);
21555 return true;
21556 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21558 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21559 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21560 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21561 return false;
21562 Loads.push_back(Ld);
21563 }
21564 return true;
21565 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21566 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21567 // are lowered. Note that this only comes up because we do not always visit
21568 // operands before uses. After that is fixed this can be removed and in the
21569 // meantime this is fairly specific to the lowering we expect from IR.
21570 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21571 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21572 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21573 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21574 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21575 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21576 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21577 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21578 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21579 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21580 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21581 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21582 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21583 B.getOperand(1).getNumOperands() != 4)
21584 return false;
21585 auto SV1 = cast<ShuffleVectorSDNode>(B);
21586 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21587 int NumElts = B.getValueType().getVectorNumElements();
21588 int NumSubElts = NumElts / 4;
21589 for (int I = 0; I < NumSubElts; I++) {
21590 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21591 if (SV1->getMaskElt(I) != I ||
21592 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21593 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21594 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21595 return false;
21596 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21597 if (SV2->getMaskElt(I) != I ||
21598 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21599 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21600 return false;
21601 }
21602 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21603 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21604 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21605 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21606 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21607 !Ld2->isSimple() || !Ld3->isSimple())
21608 return false;
21609 Loads.push_back(Ld0);
21610 Loads.push_back(Ld1);
21611 Loads.push_back(Ld2);
21612 Loads.push_back(Ld3);
21613 return true;
21614 }
21615 return false;
21616}
21617
21619 SelectionDAG &DAG,
21620 unsigned &NumSubLoads) {
21621 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21622 return false;
21623
21624 SmallVector<LoadSDNode *> Loads0, Loads1;
21625 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21626 isLoadOrMultipleLoads(Op1, Loads1)) {
21627 if (NumSubLoads && Loads0.size() != NumSubLoads)
21628 return false;
21629 NumSubLoads = Loads0.size();
21630 return Loads0.size() == Loads1.size() &&
21631 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21632 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21633 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21634 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21635 Size / 8, 1);
21636 });
21637 }
21638
21639 if (Op0.getOpcode() != Op1.getOpcode())
21640 return false;
21641
21642 switch (Op0.getOpcode()) {
21643 case ISD::ADD:
21644 case ISD::SUB:
21646 DAG, NumSubLoads) &&
21648 DAG, NumSubLoads);
21649 case ISD::SIGN_EXTEND:
21650 case ISD::ANY_EXTEND:
21651 case ISD::ZERO_EXTEND:
21652 EVT XVT = Op0.getOperand(0).getValueType();
21653 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21654 XVT.getScalarSizeInBits() != 32)
21655 return false;
21657 DAG, NumSubLoads);
21658 }
21659 return false;
21660}
21661
21662 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21663 // into a single load of twice the size, from which we extract the bottom and
21664 // top parts so that the shl can use a shll2 instruction. The two loads in that
21665 // example can also be larger trees of instructions, which are identical except
21666 // for the leaves, which are all loads offset from the LHS, including
21667 // buildvectors of multiple loads. For example the RHS tree could be
21668 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
21669 // Whilst it can be common for the larger loads to replace LDP instructions
21670 // (which doesn't gain anything on its own), the larger loads can help create
21671 // more efficient code, and in buildvectors prevent the need for ld1 lane
21672 // inserts which can be slower than normal loads.
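// Example (illustrative):
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(8)))
// can instead load v16i8 at p once, extract and extend the low and high
// halves (the high half via ushll2), and keep the shift on the extended high
// half, saving a load (sketch; subject to the profitability checks below).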
21674 EVT VT = N->getValueType(0);
21675 if (!VT.isFixedLengthVector() ||
21676 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21677 VT.getScalarSizeInBits() != 64))
21678 return SDValue();
21679
21680 SDValue Other = N->getOperand(0);
21681 SDValue Shift = N->getOperand(1);
21682 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21683 std::swap(Shift, Other);
21684 APInt ShiftAmt;
21685 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21686 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21687 return SDValue();
21688
21689 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21690 !ISD::isExtOpcode(Other.getOpcode()) ||
21691 Shift.getOperand(0).getOperand(0).getValueType() !=
21692 Other.getOperand(0).getValueType() ||
21693 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21694 return SDValue();
21695
21696 SDValue Op0 = Other.getOperand(0);
21697 SDValue Op1 = Shift.getOperand(0).getOperand(0);
21698
21699 unsigned NumSubLoads = 0;
21700 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21701 return SDValue();
21702
21703 // Attempt to rule out some unprofitable cases using heuristics (some working
21704 // around suboptimal code generation), notably if the extend would not be able
21705 // to use ushll2 instructions because the types are not large enough. Otherwise
21706 // zips will need to be created, which can increase the instruction count.
21707 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21708 unsigned NumSubElts = NumElts / NumSubLoads;
21709 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21710 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21711 Op0.getValueType().getSizeInBits() < 128 &&
21713 return SDValue();
21714
21715 // Recreate the tree with the new combined loads.
21716 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21717 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21718 EVT DVT =
21720
21721 SmallVector<LoadSDNode *> Loads0, Loads1;
21722 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21723 isLoadOrMultipleLoads(Op1, Loads1)) {
21724 EVT LoadVT = EVT::getVectorVT(
21725 *DAG.getContext(), Op0.getValueType().getScalarType(),
21726 Op0.getValueType().getVectorNumElements() / Loads0.size());
21727 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21728
21729 SmallVector<SDValue> NewLoads;
21730 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
21731 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
21732 L0->getBasePtr(), L0->getPointerInfo(),
21733 L0->getBaseAlign());
21734 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
21735 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
21736 NewLoads.push_back(Load);
21737 }
21738 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
21739 }
21740
21742 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
21743 Ops.push_back(GenCombinedTree(O0, O1, DAG));
21744 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
21745 };
21746 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21747
21748 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21749 int Hi = NumSubElts, Lo = 0;
21750 for (unsigned i = 0; i < NumSubLoads; i++) {
21751 for (unsigned j = 0; j < NumSubElts; j++) {
21752 LowMask[i * NumSubElts + j] = Lo++;
21753 HighMask[i * NumSubElts + j] = Hi++;
21754 }
21755 Lo += NumSubElts;
21756 Hi += NumSubElts;
21757 }
21758 SDLoc DL(N);
21759 SDValue Ext0, Ext1;
21760 // Extract the low and high lanes, then extend each result. If the two extend
21761 // opcodes match, extend first and then extract the lanes instead, as that
21762 // produces slightly smaller code.
21763 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
21765 NewOp, DAG.getConstant(0, DL, MVT::i64));
21766 SDValue SubH =
21767 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
21768 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21769 SDValue Extr0 =
21770 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
21771 SDValue Extr1 =
21772 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
21773 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
21774 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
21775 } else {
21777 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
21778 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21779 DAG.getConstant(0, DL, MVT::i64));
21780 SDValue SubH =
21781 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21782 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21783 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
21784 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
21785 }
21786 SDValue NShift =
21787 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
21788 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
21789}
21790
21793 // Try to change sum of two reductions.
21794 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
21795 return Val;
21796 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
21797 return Val;
21798 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
21799 return Val;
21800 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
21801 return Val;
21802 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
21803 return Val;
21805 return Val;
21806 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
21807 return Val;
21808 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21809 return Val;
21810 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
21811 return Val;
21812
21813 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
21814 return Val;
21815
21816 return performAddSubLongCombine(N, DCI);
21817}
21818
21819// Massage DAGs which we can use the high-half "long" operations on into
21820// something isel will recognize better. E.g.
21821//
21822// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21823 //   (aarch64_neon_umull (extract_high (v2i64 vec))
21824 //                       (extract_high (v2i64 (dup128 scalar))))
21825//
21828 SelectionDAG &DAG) {
21829 if (DCI.isBeforeLegalizeOps())
21830 return SDValue();
21831
21832 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
21833 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
21834 assert(LHS.getValueType().is64BitVector() &&
21835 RHS.getValueType().is64BitVector() &&
21836 "unexpected shape for long operation");
21837
21838 // Either node could be a DUP, but it's not worth doing both of them (you'd
21839 // just as well use the non-high version) so look for a corresponding extract
21840 // operation on the other "wing".
21843 if (!RHS.getNode())
21844 return SDValue();
21847 if (!LHS.getNode())
21848 return SDValue();
21849 } else
21850 return SDValue();
21851
21852 if (IID == Intrinsic::not_intrinsic)
21853 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
21854
21855 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
21856 N->getOperand(0), LHS, RHS);
21857}
21858
21859static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21860 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
21861 unsigned ElemBits = ElemTy.getSizeInBits();
21862
21863 int64_t ShiftAmount;
21864 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
21865 APInt SplatValue, SplatUndef;
21866 unsigned SplatBitSize;
21867 bool HasAnyUndefs;
21868 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21869 HasAnyUndefs, ElemBits) ||
21870 SplatBitSize != ElemBits)
21871 return SDValue();
21872
21873 ShiftAmount = SplatValue.getSExtValue();
21874 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
21875 ShiftAmount = CVN->getSExtValue();
21876 } else
21877 return SDValue();
21878
21879 // If the shift amount is zero, remove the shift intrinsic.
21880 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21881 return N->getOperand(1);
21882
21883 unsigned Opcode;
21884 bool IsRightShift;
21885 switch (IID) {
21886 default:
21887 llvm_unreachable("Unknown shift intrinsic");
21888 case Intrinsic::aarch64_neon_sqshl:
21889 Opcode = AArch64ISD::SQSHL_I;
21890 IsRightShift = false;
21891 break;
21892 case Intrinsic::aarch64_neon_uqshl:
21893 Opcode = AArch64ISD::UQSHL_I;
21894 IsRightShift = false;
21895 break;
21896 case Intrinsic::aarch64_neon_srshl:
21897 Opcode = AArch64ISD::SRSHR_I;
21898 IsRightShift = true;
21899 break;
21900 case Intrinsic::aarch64_neon_urshl:
21901 Opcode = AArch64ISD::URSHR_I;
21902 IsRightShift = true;
21903 break;
21904 case Intrinsic::aarch64_neon_sqshlu:
21905 Opcode = AArch64ISD::SQSHLU_I;
21906 IsRightShift = false;
21907 break;
21908 case Intrinsic::aarch64_neon_sshl:
21909 case Intrinsic::aarch64_neon_ushl:
21910 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
21911 // left shift for positive shift amounts. For negative shifts we can use a
21912 // VASHR/VLSHR as appropriate.
21913 if (ShiftAmount < 0) {
21914 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21915 : AArch64ISD::VLSHR;
21916 ShiftAmount = -ShiftAmount;
21917 } else
21918 Opcode = AArch64ISD::VSHL;
21919 IsRightShift = false;
21920 break;
21921 }
21922
21923 EVT VT = N->getValueType(0);
21924 SDValue Op = N->getOperand(1);
21925 SDLoc DL(N);
21926 if (VT == MVT::i64) {
21927 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
21928 VT = MVT::v1i64;
21929 }
21930
21931 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21932 Op = DAG.getNode(Opcode, DL, VT, Op,
21933 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32));
21934 if (N->getValueType(0) == MVT::i64)
21935 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
21936 DAG.getConstant(0, DL, MVT::i64));
21937 return Op;
21938 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21939 Op = DAG.getNode(Opcode, DL, VT, Op,
21940 DAG.getConstant(ShiftAmount, DL, MVT::i32));
21941 if (N->getValueType(0) == MVT::i64)
21942 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
21943 DAG.getConstant(0, DL, MVT::i64));
21944 return Op;
21945 }
21946
21947 return SDValue();
21948}
21949
21950// The CRC32[BH] instructions ignore the high bits of their data operand. Since
21951// the intrinsics must be legal and take an i32, this means there's almost
21952// certainly going to be a zext in the DAG which we can eliminate.
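// Example (illustrative): crc32b(w0, zext i8 x) typically reaches the DAG as
// crc32b(w0, and(w1, 0xff)); since CRC32B only reads the low 8 bits of its
// data operand, the AND is dropped and w1 is used directly.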
21953static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21954 SDValue AndN = N->getOperand(2);
21955 if (AndN.getOpcode() != ISD::AND)
21956 return SDValue();
21957
21958 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
21959 if (!CMask || CMask->getZExtValue() != Mask)
21960 return SDValue();
21961
21962 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
21963 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
21964}
21965
21967 SelectionDAG &DAG) {
21968 SDLoc DL(N);
21969 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
21970 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
21971 N->getOperand(1)),
21972 DAG.getConstant(0, DL, MVT::i64));
21973}
21974
21976 SDLoc DL(N);
21977 SDValue Op1 = N->getOperand(1);
21978 SDValue Op2 = N->getOperand(2);
21979 EVT ScalarTy = Op2.getValueType();
21980 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21981 ScalarTy = MVT::i32;
21982
21983 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
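  // Example (illustrative): index_vector(3, 2) on nxv4i32 is lowered to
  // step_vector(0,1,2,...) * splat(2) + splat(3), i.e. the sequence 3, 5, 7, 9, ...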
21984 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
21985 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
21986 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
21987 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
21988 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
21989}
21990
21992 SDLoc DL(N);
21993 SDValue Scalar = N->getOperand(3);
21994 EVT ScalarTy = Scalar.getValueType();
21995
21996 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21997 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
21998
21999 SDValue Passthru = N->getOperand(1);
22000 SDValue Pred = N->getOperand(2);
22001 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22002 Pred, Scalar, Passthru);
22003}
22004
22006 SDLoc DL(N);
22007 LLVMContext &Ctx = *DAG.getContext();
22008 EVT VT = N->getValueType(0);
22009
22010 assert(VT.isScalableVector() && "Expected a scalable vector.");
22011
22012 // Current lowering only supports the SVE-ACLE types.
22014 return SDValue();
22015
22016 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22017 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22018 EVT ByteVT =
22019 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22020
22021 // Convert everything to the domain of EXT (i.e. bytes).
22022 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22023 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22024 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22025 DAG.getConstant(ElemSize, DL, MVT::i32));
22026
22027 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22028 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22029}
22030
22033 SelectionDAG &DAG) {
22034 if (DCI.isBeforeLegalize())
22035 return SDValue();
22036
22037 SDValue Comparator = N->getOperand(3);
22038 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22039 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22040 unsigned IID = getIntrinsicID(N);
22041 EVT VT = N->getValueType(0);
22042 EVT CmpVT = N->getOperand(2).getValueType();
22043 SDValue Pred = N->getOperand(1);
22044 SDValue Imm;
22045 SDLoc DL(N);
22046
22047 switch (IID) {
22048 default:
22049 llvm_unreachable("Called with wrong intrinsic!");
22050 break;
22051
22052 // Signed comparisons
22053 case Intrinsic::aarch64_sve_cmpeq_wide:
22054 case Intrinsic::aarch64_sve_cmpne_wide:
22055 case Intrinsic::aarch64_sve_cmpge_wide:
22056 case Intrinsic::aarch64_sve_cmpgt_wide:
22057 case Intrinsic::aarch64_sve_cmplt_wide:
22058 case Intrinsic::aarch64_sve_cmple_wide: {
22059 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22060 int64_t ImmVal = CN->getSExtValue();
22061 if (ImmVal >= -16 && ImmVal <= 15)
22062 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22063 else
22064 return SDValue();
22065 }
22066 break;
22067 }
22068 // Unsigned comparisons
22069 case Intrinsic::aarch64_sve_cmphs_wide:
22070 case Intrinsic::aarch64_sve_cmphi_wide:
22071 case Intrinsic::aarch64_sve_cmplo_wide:
22072 case Intrinsic::aarch64_sve_cmpls_wide: {
22073 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22074 uint64_t ImmVal = CN->getZExtValue();
22075 if (ImmVal <= 127)
22076 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22077 else
22078 return SDValue();
22079 }
22080 break;
22081 }
22082 }
22083
22084 if (!Imm)
22085 return SDValue();
22086
22087 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22088 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22089 N->getOperand(2), Splat, DAG.getCondCode(CC));
22090 }
22091
22092 return SDValue();
22093}
22094
22097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22098
22099 SDLoc DL(Op);
22100 assert(Op.getValueType().isScalableVector() &&
22101 TLI.isTypeLegal(Op.getValueType()) &&
22102 "Expected legal scalable vector type!");
22103 assert(Op.getValueType() == Pg.getValueType() &&
22104 "Expected same type for PTEST operands");
22105
22106 // Ensure target specific opcodes are using legal type.
22107 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22108 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22109 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22110
22111 // Ensure operands have type nxv16i1.
22112 if (Op.getValueType() != MVT::nxv16i1) {
22115 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22116 else
22117 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22118 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22119 }
22120
22121 // Set condition code (CC) flags.
22122 SDValue Test = DAG.getNode(
22123 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
22124 DL, MVT::i32, Pg, Op);
22125
22126 // Convert CC to integer based on requested condition.
22127 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22128 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22129 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22130 return DAG.getZExtOrTrunc(Res, DL, VT);
22131}
22132
22134 SelectionDAG &DAG) {
22135 SDLoc DL(N);
22136
22137 SDValue Pred = N->getOperand(1);
22138 SDValue VecToReduce = N->getOperand(2);
22139
22140 // NOTE: The integer reduction's result type is not always linked to the
22141 // operand's element type so we construct it from the intrinsic's result type.
22142 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22143 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22144
22145 // SVE reductions set the whole vector register with the first element
22146 // containing the reduction result, which we'll now extract.
22147 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22148 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22149 Zero);
22150}
22151
22153 SelectionDAG &DAG) {
22154 SDLoc DL(N);
22155
22156 SDValue Pred = N->getOperand(1);
22157 SDValue VecToReduce = N->getOperand(2);
22158
22159 EVT ReduceVT = VecToReduce.getValueType();
22160 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22161
22162 // SVE reductions set the whole vector register with the first element
22163 // containing the reduction result, which we'll now extract.
22164 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22165 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22166 Zero);
22167}
22168
22170 SelectionDAG &DAG) {
22171 SDLoc DL(N);
22172
22173 SDValue Pred = N->getOperand(1);
22174 SDValue InitVal = N->getOperand(2);
22175 SDValue VecToReduce = N->getOperand(3);
22176 EVT ReduceVT = VecToReduce.getValueType();
22177
22178 // Ordered reductions use the first lane of the result vector as the
22179 // reduction's initial value.
22180 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22181 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22182 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22183
22184 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22185
22186 // SVE reductions set the whole vector register with the first element
22187 // containing the reduction result, which we'll now extract.
22188 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22189 Zero);
22190}
22191
22192// If a merged operation has no inactive lanes we can relax it to a predicated
22193// or unpredicated operation, which potentially allows better isel (perhaps
22194// using immediate forms) or relaxing register reuse requirements.
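// Example (illustrative): a merging intrinsic such as svadd_m whose governing
// predicate is known to be all active has no inactive lanes to preserve, so it
// can be lowered to a plain predicated (or unpredicated) ADD, which frees ISel
// to use immediate forms and relaxes register-reuse constraints.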
22196 SelectionDAG &DAG, bool UnpredOp = false,
22197 bool SwapOperands = false) {
22198 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22199 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22200 SDValue Pg = N->getOperand(1);
22201 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22202 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22203
22204 // ISD way to specify an all active predicate.
22205 if (isAllActivePredicate(DAG, Pg)) {
22206 if (UnpredOp)
22207 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22208
22209 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22210 }
22211
22212 // FUTURE: SplatVector(true)
22213 return SDValue();
22214}
22215
22217 const AArch64Subtarget *Subtarget,
22218 SelectionDAG &DAG) {
22219
22220 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
22221 getIntrinsicID(N) ==
22222 Intrinsic::experimental_vector_partial_reduce_add &&
22223 "Expected a partial reduction node");
22224
22225 bool Scalable = N->getValueType(0).isScalableVector();
22226 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
22227 return SDValue();
22228 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
22229 return SDValue();
22230
22231 SDLoc DL(N);
22232
22233 SDValue Op2 = N->getOperand(2);
22234 unsigned Op2Opcode = Op2->getOpcode();
22235 SDValue MulOpLHS, MulOpRHS;
22236 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
22237 if (ISD::isExtOpcode(Op2Opcode)) {
22238 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
22239 MulOpLHS = Op2->getOperand(0);
22240 MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
22241 } else if (Op2Opcode == ISD::MUL) {
22242 SDValue ExtMulOpLHS = Op2->getOperand(0);
22243 SDValue ExtMulOpRHS = Op2->getOperand(1);
22244
22245 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
22246 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
22247 if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
22248 !ISD::isExtOpcode(ExtMulOpRHSOpcode))
22249 return SDValue();
22250
22251 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
22252 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
22253
22254 MulOpLHS = ExtMulOpLHS->getOperand(0);
22255 MulOpRHS = ExtMulOpRHS->getOperand(0);
22256
22257 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
22258 return SDValue();
22259 } else
22260 return SDValue();
22261
22262 SDValue Acc = N->getOperand(1);
22263 EVT ReducedVT = N->getValueType(0);
22264 EVT MulSrcVT = MulOpLHS.getValueType();
22265
22266 // Dot products operate on chunks of four elements, so there must be four
22267 // times as many elements in the wide type.
22268 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
22269 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
22270 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
22271 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
22272 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
22273 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
22274 return SDValue();
22275
22276 // If the extensions are mixed, we should lower it to a usdot instead
22277 unsigned Opcode = 0;
22278 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
22279 if (!Subtarget->hasMatMulInt8())
22280 return SDValue();
22281
22282 bool Scalable = N->getValueType(0).isScalableVT();
22283 // There's no nxv2i64 version of usdot
22284 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
22285 return SDValue();
22286
22287 Opcode = AArch64ISD::USDOT;
22288 // USDOT expects the signed operand to be last
22289 if (!MulOpRHSIsSigned)
22290 std::swap(MulOpLHS, MulOpRHS);
22291 } else
22292 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
22293
22294 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
22295 // product followed by a zero / sign extension
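  // Example (illustrative): a v16i8 -> v4i64 partial reduction is emitted as a
  // v4i32 UDOT/SDOT from a zero accumulator, extended to v4i64, and then added
  // to the original accumulator, as there is no direct i8 -> i64 dot product.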
22296 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
22297 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
22298 EVT ReducedVTI32 =
22299 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
22300
22301 SDValue DotI32 =
22302 DAG.getNode(Opcode, DL, ReducedVTI32,
22303 DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
22304 SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
22305 return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
22306 }
22307
22308 return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
22309}
22310
22312 const AArch64Subtarget *Subtarget,
22313 SelectionDAG &DAG) {
22314
22315 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
22316 getIntrinsicID(N) ==
22317 Intrinsic::experimental_vector_partial_reduce_add &&
22318 "Expected a partial reduction node");
22319
22320 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
22321 return SDValue();
22322
22323 SDLoc DL(N);
22324
22325 if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
22326 return SDValue();
22327 SDValue Acc = N->getOperand(1);
22328 SDValue Ext = N->getOperand(2);
22329 EVT AccVT = Acc.getValueType();
22330 EVT ExtVT = Ext.getValueType();
22331 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
22332 return SDValue();
22333
22334 SDValue ExtOp = Ext->getOperand(0);
22335 EVT ExtOpVT = ExtOp.getValueType();
22336
22337 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
22338 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
22339 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
22340 return SDValue();
22341
22342 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
22343 unsigned BottomOpcode =
22344 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
22345 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
22346 SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
22347 return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
22348}
22349
22350static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22351 SDLoc DL(N);
22352 EVT VT = N->getValueType(0);
22353 SDValue Op1 = N->getOperand(1);
22354 SDValue Op2 = N->getOperand(2);
22355 SDValue Op3 = N->getOperand(3);
22356
22357 switch (IID) {
22358 default:
22359 llvm_unreachable("Called with wrong intrinsic!");
22360 case Intrinsic::aarch64_sve_bsl:
22361 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22362 case Intrinsic::aarch64_sve_bsl1n:
22363 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22364 Op2);
22365 case Intrinsic::aarch64_sve_bsl2n:
22366 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22367 DAG.getNOT(DL, Op2, VT));
22368 case Intrinsic::aarch64_sve_nbsl:
22369 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22370 VT);
22371 }
22372}
22373
22376 const AArch64Subtarget *Subtarget) {
22377 SelectionDAG &DAG = DCI.DAG;
22378 unsigned IID = getIntrinsicID(N);
22379 switch (IID) {
22380 default:
22381 break;
22382 case Intrinsic::experimental_vector_partial_reduce_add: {
22383 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
22384 return Dot;
22385 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
22386 return WideAdd;
22387 SDLoc DL(N);
22388 SDValue Input = N->getOperand(2);
22389 return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0),
22390 N->getOperand(1), Input,
22391 DAG.getConstant(1, DL, Input.getValueType()));
22392 }
22393 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22394 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22395 return tryCombineFixedPointConvert(N, DCI, DAG);
22396 case Intrinsic::aarch64_neon_saddv:
22397 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22398 case Intrinsic::aarch64_neon_uaddv:
22399 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22400 case Intrinsic::aarch64_neon_sminv:
22401 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22402 case Intrinsic::aarch64_neon_uminv:
22403 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22404 case Intrinsic::aarch64_neon_smaxv:
22405 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22406 case Intrinsic::aarch64_neon_umaxv:
22407 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22408 case Intrinsic::aarch64_neon_fmax:
22409 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22410 N->getOperand(1), N->getOperand(2));
22411 case Intrinsic::aarch64_neon_fmin:
22412 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22413 N->getOperand(1), N->getOperand(2));
22414 case Intrinsic::aarch64_neon_fmaxnm:
22415 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22416 N->getOperand(1), N->getOperand(2));
22417 case Intrinsic::aarch64_neon_fminnm:
22418 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22419 N->getOperand(1), N->getOperand(2));
22420 case Intrinsic::aarch64_neon_smull:
22421 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22422 N->getOperand(1), N->getOperand(2));
22423 case Intrinsic::aarch64_neon_umull:
22424 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22425 N->getOperand(1), N->getOperand(2));
22426 case Intrinsic::aarch64_neon_pmull:
22427 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22428 N->getOperand(1), N->getOperand(2));
22429 case Intrinsic::aarch64_neon_sqdmull:
22430 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22431 case Intrinsic::aarch64_neon_sqshl:
22432 case Intrinsic::aarch64_neon_uqshl:
22433 case Intrinsic::aarch64_neon_sqshlu:
22434 case Intrinsic::aarch64_neon_srshl:
22435 case Intrinsic::aarch64_neon_urshl:
22436 case Intrinsic::aarch64_neon_sshl:
22437 case Intrinsic::aarch64_neon_ushl:
22438 return tryCombineShiftImm(IID, N, DAG);
22439 case Intrinsic::aarch64_neon_sabd:
22440 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22441 N->getOperand(1), N->getOperand(2));
22442 case Intrinsic::aarch64_neon_uabd:
22443 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22444 N->getOperand(1), N->getOperand(2));
22445 case Intrinsic::aarch64_crc32b:
22446 case Intrinsic::aarch64_crc32cb:
22447 return tryCombineCRC32(0xff, N, DAG);
22448 case Intrinsic::aarch64_crc32h:
22449 case Intrinsic::aarch64_crc32ch:
22450 return tryCombineCRC32(0xffff, N, DAG);
22451 case Intrinsic::aarch64_sve_saddv:
22452 // There is no i64 version of SADDV because the sign is irrelevant.
22453 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
22454 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22455 else
22456 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22457 case Intrinsic::aarch64_sve_uaddv:
22458 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22459 case Intrinsic::aarch64_sve_smaxv:
22460 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22461 case Intrinsic::aarch64_sve_umaxv:
22462 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22463 case Intrinsic::aarch64_sve_sminv:
22464 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22465 case Intrinsic::aarch64_sve_uminv:
22466 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22467 case Intrinsic::aarch64_sve_orv:
22468 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22469 case Intrinsic::aarch64_sve_eorv:
22470 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22471 case Intrinsic::aarch64_sve_andv:
22472 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22473 case Intrinsic::aarch64_sve_index:
22474 return LowerSVEIntrinsicIndex(N, DAG);
22475 case Intrinsic::aarch64_sve_dup:
22476 return LowerSVEIntrinsicDUP(N, DAG);
22477 case Intrinsic::aarch64_sve_dup_x:
22478 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22479 N->getOperand(1));
22480 case Intrinsic::aarch64_sve_ext:
22481 return LowerSVEIntrinsicEXT(N, DAG);
22482 case Intrinsic::aarch64_sve_mul_u:
22483 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22484 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22485 case Intrinsic::aarch64_sve_smulh_u:
22486 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22487 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22488 case Intrinsic::aarch64_sve_umulh_u:
22489 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22490 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22491 case Intrinsic::aarch64_sve_smin_u:
22492 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22493 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22494 case Intrinsic::aarch64_sve_umin_u:
22495 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22496 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22497 case Intrinsic::aarch64_sve_smax_u:
22498 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22499 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22500 case Intrinsic::aarch64_sve_umax_u:
22501 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22502 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22503 case Intrinsic::aarch64_sve_lsl_u:
22504 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22505 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22506 case Intrinsic::aarch64_sve_lsr_u:
22507 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22508 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22509 case Intrinsic::aarch64_sve_asr_u:
22510 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22511 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22512 case Intrinsic::aarch64_sve_fadd_u:
22513 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22514 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22515 case Intrinsic::aarch64_sve_fdiv_u:
22516 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22517 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22518 case Intrinsic::aarch64_sve_fmax_u:
22519 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22520 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22521 case Intrinsic::aarch64_sve_fmaxnm_u:
22522 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22523 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22524 case Intrinsic::aarch64_sve_fmla_u:
22525 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22526 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22527 N->getOperand(2));
22528 case Intrinsic::aarch64_sve_fmin_u:
22529 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22530 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22531 case Intrinsic::aarch64_sve_fminnm_u:
22532 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22533 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22534 case Intrinsic::aarch64_sve_fmul_u:
22535 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22536 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22537 case Intrinsic::aarch64_sve_fsub_u:
22538 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22539 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22540 case Intrinsic::aarch64_sve_add_u:
22541 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22542 N->getOperand(3));
22543 case Intrinsic::aarch64_sve_sub_u:
22544 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22545 N->getOperand(3));
22546 case Intrinsic::aarch64_sve_subr:
22547 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22548 case Intrinsic::aarch64_sve_and_u:
22549 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22550 N->getOperand(3));
22551 case Intrinsic::aarch64_sve_bic_u:
22552 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22553 N->getOperand(2), N->getOperand(3));
22554 case Intrinsic::aarch64_sve_saddwb:
22555 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22556 N->getOperand(1), N->getOperand(2));
22557 case Intrinsic::aarch64_sve_saddwt:
22558 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22559 N->getOperand(1), N->getOperand(2));
22560 case Intrinsic::aarch64_sve_uaddwb:
22561 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22562 N->getOperand(1), N->getOperand(2));
22563 case Intrinsic::aarch64_sve_uaddwt:
22564 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22565 N->getOperand(1), N->getOperand(2));
22566 case Intrinsic::aarch64_sve_eor_u:
22567 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22568 N->getOperand(3));
22569 case Intrinsic::aarch64_sve_orr_u:
22570 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22571 N->getOperand(3));
22572 case Intrinsic::aarch64_sve_sabd_u:
22573 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22574 N->getOperand(2), N->getOperand(3));
22575 case Intrinsic::aarch64_sve_uabd_u:
22576 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22577 N->getOperand(2), N->getOperand(3));
22578 case Intrinsic::aarch64_sve_sdiv_u:
22579 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22580 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22581 case Intrinsic::aarch64_sve_udiv_u:
22582 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22583 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22584 case Intrinsic::aarch64_sve_sqadd:
22585 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22586 case Intrinsic::aarch64_sve_sqsub_u:
22587 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22588 N->getOperand(2), N->getOperand(3));
22589 case Intrinsic::aarch64_sve_uqadd:
22590 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22591 case Intrinsic::aarch64_sve_uqsub_u:
22592 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22593 N->getOperand(2), N->getOperand(3));
22594 case Intrinsic::aarch64_sve_sqadd_x:
22595 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22596 N->getOperand(1), N->getOperand(2));
22597 case Intrinsic::aarch64_sve_sqsub_x:
22598 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22599 N->getOperand(1), N->getOperand(2));
22600 case Intrinsic::aarch64_sve_uqadd_x:
22601 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22602 N->getOperand(1), N->getOperand(2));
22603 case Intrinsic::aarch64_sve_uqsub_x:
22604 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22605 N->getOperand(1), N->getOperand(2));
22606 case Intrinsic::aarch64_sve_asrd:
22607 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22608 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22609 case Intrinsic::aarch64_sve_cmphs:
22610 if (!N->getOperand(2).getValueType().isFloatingPoint())
22611 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22612 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22613 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22614 break;
22615 case Intrinsic::aarch64_sve_cmphi:
22616 if (!N->getOperand(2).getValueType().isFloatingPoint())
22617 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22618 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22619 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22620 break;
22621 case Intrinsic::aarch64_sve_fcmpge:
22622 case Intrinsic::aarch64_sve_cmpge:
22623 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22624 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22625 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22626 break;
22627 case Intrinsic::aarch64_sve_fcmpgt:
22628 case Intrinsic::aarch64_sve_cmpgt:
22629 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22630 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22631 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22632 break;
22633 case Intrinsic::aarch64_sve_fcmpeq:
22634 case Intrinsic::aarch64_sve_cmpeq:
22635 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22636 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22637 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22638 break;
22639 case Intrinsic::aarch64_sve_fcmpne:
22640 case Intrinsic::aarch64_sve_cmpne:
22641 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22642 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22643 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22644 break;
22645 case Intrinsic::aarch64_sve_fcmpuo:
22646 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22647 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22648 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22649 break;
22650 case Intrinsic::aarch64_sve_fadda:
22651 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22652 case Intrinsic::aarch64_sve_faddv:
22653 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22654 case Intrinsic::aarch64_sve_fmaxnmv:
22655 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22656 case Intrinsic::aarch64_sve_fmaxv:
22657 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22658 case Intrinsic::aarch64_sve_fminnmv:
22659 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22660 case Intrinsic::aarch64_sve_fminv:
22661 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22662 case Intrinsic::aarch64_sve_sel:
22663 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22664 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22665 case Intrinsic::aarch64_sve_cmpeq_wide:
22666 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22667 case Intrinsic::aarch64_sve_cmpne_wide:
22668 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22669 case Intrinsic::aarch64_sve_cmpge_wide:
22670 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22671 case Intrinsic::aarch64_sve_cmpgt_wide:
22672 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22673 case Intrinsic::aarch64_sve_cmplt_wide:
22674 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22675 case Intrinsic::aarch64_sve_cmple_wide:
22676 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22677 case Intrinsic::aarch64_sve_cmphs_wide:
22678 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22679 case Intrinsic::aarch64_sve_cmphi_wide:
22680 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22681 case Intrinsic::aarch64_sve_cmplo_wide:
22682 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22683 case Intrinsic::aarch64_sve_cmpls_wide:
22684 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22685 case Intrinsic::aarch64_sve_ptest_any:
22686 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22687 AArch64CC::ANY_ACTIVE);
22688 case Intrinsic::aarch64_sve_ptest_first:
22689 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22690 AArch64CC::FIRST_ACTIVE);
22691 case Intrinsic::aarch64_sve_ptest_last:
22692 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22693 AArch64CC::LAST_ACTIVE);
22694 case Intrinsic::aarch64_sve_whilelo:
22695 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
22696 N->getOperand(1), N->getOperand(2));
22697 case Intrinsic::aarch64_sve_bsl:
22698 case Intrinsic::aarch64_sve_bsl1n:
22699 case Intrinsic::aarch64_sve_bsl2n:
22700 case Intrinsic::aarch64_sve_nbsl:
22701 return combineSVEBitSel(IID, N, DAG);
22702 }
22703 return SDValue();
22704}
22705
22706static bool isCheapToExtend(const SDValue &N) {
22707 unsigned OC = N->getOpcode();
22708 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22710}
22711
22712static SDValue
22713 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22714 SelectionDAG &DAG) {
22715 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22716 // we can move the sext into the arguments and have the same result. For
22717 // example, if A and B are both loads, we can make those extending loads and
22718 // avoid an extra instruction. This pattern appears often in VLS code
22719 // generation where the inputs to the setcc have a different size to the
22720 // instruction that wants to use the result of the setcc.
22721 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22722 N->getOperand(0)->getOpcode() == ISD::SETCC);
22723 const SDValue SetCC = N->getOperand(0);
22724
22725 const SDValue CCOp0 = SetCC.getOperand(0);
22726 const SDValue CCOp1 = SetCC.getOperand(1);
22727 if (!CCOp0->getValueType(0).isInteger() ||
22728 !CCOp1->getValueType(0).isInteger())
22729 return SDValue();
22730
22731 ISD::CondCode Code =
22732 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22733
22734 ISD::NodeType ExtType =
22735 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22736
22737 if (isCheapToExtend(SetCC.getOperand(0)) &&
22738 isCheapToExtend(SetCC.getOperand(1))) {
22739 const SDValue Ext1 =
22740 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22741 const SDValue Ext2 =
22742 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22743
22744 return DAG.getSetCC(
22745 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22746 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22747 }
22748
22749 return SDValue();
22750}
22751
22752// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22753// This comes from interleaved vectorization. It is performed late to capture
22754// uitofp converts too.
22755 static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22756 SelectionDAG &DAG) {
22757 EVT VT = N->getValueType(0);
22758 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22759 N->getOpcode() != ISD::ZERO_EXTEND ||
22760 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22761 return SDValue();
22762
22763 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22764 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22765 return SDValue();
22766
22767 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22768 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22769 if (!Shuffle ||
22770 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22771 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22772 return SDValue();
22773
22774 unsigned Idx;
22775 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22776 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22777 // An undef interleave shuffle can come up after other canonicalizations,
22778 // where the shuffle has been converted to
22779 // zext(extract(shuffle b, undef, [u,u,0,4]))
22780 bool IsUndefDeInterleave = false;
22781 if (!IsDeInterleave)
22782 IsUndefDeInterleave =
22783 Shuffle->getOperand(1).isUndef() &&
22784 all_of(
22785 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
22786 [](int M) { return M < 0; }) &&
22787 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22788 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22789 VT.getVectorNumElements() / 2),
22790 4, Idx);
22791 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22792 return SDValue();
22793 SDLoc DL(N);
22794 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22795 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22796 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22797 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22798 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22799 VT, BC1, BC2);
22800 if ((Idx & 1) == 1)
22801 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22802 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22803 return DAG.getNode(
22804 ISD::AND, DL, VT, UZP,
22805 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22806}
22807
22808// This comes up similar to the above when lowering deinterleaving shuffles from
22809 // zexts. We have legalized the operations in the general case to
22810// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22811// the extract is to the low half and the uzp is uzp1. There would be an extra
22812// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22813// there could also be an existing and / shift that can be combined in, either
22814 // before or after the extract.
22815 static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22816 EVT VT = N->getValueType(0);
22817 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22818 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22819 return SDValue();
22820
22821 SDValue Op = N->getOperand(0);
22822 unsigned ExtOffset = (unsigned)-1;
22823 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22824 ExtOffset = Op.getConstantOperandVal(1);
22825 Op = Op.getOperand(0);
22826 }
22827
22828 unsigned Shift = 0;
22829 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
22830 Op.getValueType().getScalarSizeInBits());
22831
22832 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22833 Shift = Op.getConstantOperandVal(1);
22834 Op = Op.getOperand(0);
22835 Mask = Mask.lshr(Shift);
22836 }
22837 if (Op.getOpcode() == ISD::AND &&
22838 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
22839 Op = Op.getOperand(0);
22840 Mask = Mask.zext(VT.getScalarSizeInBits());
22841 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22842 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22843 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
22844 Mask = Mask.zext(VT.getScalarSizeInBits());
22845 Op = Op.getOperand(0);
22846 }
22847
22848 if (ExtOffset == (unsigned)-1) {
22849 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22850 ExtOffset = Op.getConstantOperandVal(1);
22851 Op = Op.getOperand(0);
22852 } else
22853 return SDValue();
22854 }
22855 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22856 return SDValue();
22857
22858 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22859 return SDValue();
22860 if (Op.getOpcode() == AArch64ISD::UZP2)
22861 Shift += VT.getScalarSizeInBits() / 2;
22862
22863 SDLoc DL(N);
22864 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22865 Op.getOperand(ExtOffset == 0 ? 0 : 1));
22866 if (Shift != 0)
22867 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
22868 DAG.getConstant(Shift, DL, MVT::i32));
22869 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
22870}
22871
22872 static SDValue performExtendCombine(SDNode *N,
22873 TargetLowering::DAGCombinerInfo &DCI,
22874 SelectionDAG &DAG) {
22875 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22876 // we can convert that DUP into another extract_high (of a bigger DUP), which
22877 // helps the backend to decide that an sabdl2 would be useful, saving a real
22878 // extract_high operation.
22879 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22880 N->getOperand(0).getValueType().is64BitVector() &&
22881 (N->getOperand(0).getOpcode() == ISD::ABDU ||
22882 N->getOperand(0).getOpcode() == ISD::ABDS)) {
22883 SDNode *ABDNode = N->getOperand(0).getNode();
22884 SDValue NewABD =
22885 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
22886 if (!NewABD.getNode())
22887 return SDValue();
22888
22889 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
22890 }
22891
22892 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
22893 return R;
22894 if (SDValue R = performZExtUZPCombine(N, DAG))
22895 return R;
22896
22897 if (N->getValueType(0).isFixedLengthVector() &&
22898 N->getOpcode() == ISD::SIGN_EXTEND &&
22899 N->getOperand(0)->getOpcode() == ISD::SETCC)
22900 return performSignExtendSetCCCombine(N, DCI, DAG);
22901
22902 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22903 // that the top half of the result register must be unused, due to the
22904 // any_extend. This means that we can replace this pattern with (rev16
22905 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22906 // ...)), which is what this pattern would otherwise be lowered to.
22907 // Only apply this optimisation if the any_extend in the original pattern
22908 // extends to i32 or i64, because this type will become the input type to
22909 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
22910 SDValue Bswap = N->getOperand(0);
22911 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22912 Bswap.getValueType() == MVT::i16 &&
22913 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
22914 SDLoc DL(N);
22915 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
22916 Bswap->getOperand(0));
22917 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
22918 NewAnyExtend);
22919 }
22920
22921 return SDValue();
22922}
22923
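// Replace a store of a splatted value with NumVecElts scalar stores of the
// splat value; the load/store optimizer can then merge them into STP pairs.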
22924 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22925 SDValue SplatVal, unsigned NumVecElts) {
22926 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22927 Align OrigAlignment = St.getAlign();
22928 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22929
22930 // Create scalar stores. This is at least as good as the code sequence for a
22931 // split unaligned store which is a dup.s, ext.b, and two stores.
22932 // Most of the time the three stores should be replaced by store pair
22933 // instructions (stp).
22934 SDLoc DL(&St);
22935 SDValue BasePtr = St.getBasePtr();
22936 uint64_t BaseOffset = 0;
22937
22938 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22939 SDValue NewST1 =
22940 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
22941 OrigAlignment, St.getMemOperand()->getFlags());
22942
22943 // As this is in ISel, we will not merge this add, which may degrade results.
22944 if (BasePtr->getOpcode() == ISD::ADD &&
22945 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
22946 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
22947 BasePtr = BasePtr->getOperand(0);
22948 }
22949
22950 unsigned Offset = EltOffset;
22951 while (--NumVecElts) {
22952 Align Alignment = commonAlignment(OrigAlignment, Offset);
22953 SDValue OffsetPtr =
22954 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22955 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
22956 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
22957 PtrInfo.getWithOffset(Offset), Alignment,
22958 St.getMemOperand()->getFlags());
22959 Offset += EltOffset;
22960 }
22961 return NewST1;
22962}
22963
22964// Returns an SVE type that ContentTy can be trivially sign or zero extended
22965// into.
22966static MVT getSVEContainerType(EVT ContentTy) {
22967 assert(ContentTy.isSimple() && "No SVE containers for extended types");
22968
22969 switch (ContentTy.getSimpleVT().SimpleTy) {
22970 default:
22971 llvm_unreachable("No known SVE container for this MVT type");
22972 case MVT::nxv2i8:
22973 case MVT::nxv2i16:
22974 case MVT::nxv2i32:
22975 case MVT::nxv2i64:
22976 case MVT::nxv2f32:
22977 case MVT::nxv2f64:
22978 return MVT::nxv2i64;
22979 case MVT::nxv4i8:
22980 case MVT::nxv4i16:
22981 case MVT::nxv4i32:
22982 case MVT::nxv4f32:
22983 return MVT::nxv4i32;
22984 case MVT::nxv8i8:
22985 case MVT::nxv8i16:
22986 case MVT::nxv8f16:
22987 case MVT::nxv8bf16:
22988 return MVT::nxv8i16;
22989 case MVT::nxv16i8:
22990 return MVT::nxv16i8;
22991 }
22992}
22993
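// Lower a predicated SVE load intrinsic to the AArch64ISD node Opc, loading
// into the wider SVE container type and truncating back to VT if necessary.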
22994 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22995 SDLoc DL(N);
22996 EVT VT = N->getValueType(0);
22997
22998 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
22999 return SDValue();
23000
23001 EVT ContainerVT = VT;
23002 if (ContainerVT.isInteger())
23003 ContainerVT = getSVEContainerType(ContainerVT);
23004
23005 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23006 SDValue Ops[] = { N->getOperand(0), // Chain
23007 N->getOperand(2), // Pg
23008 N->getOperand(3), // Base
23009 DAG.getValueType(VT) };
23010
23011 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23012 SDValue LoadChain = SDValue(Load.getNode(), 1);
23013
23014 if (ContainerVT.isInteger() && (VT != ContainerVT))
23015 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23016
23017 return DAG.getMergeValues({ Load, LoadChain }, DL);
23018}
23019
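// Lower an SVE load intrinsic with (pred, base) operands to a masked load
// with a zero passthru, bitcasting back to the floating-point type if needed.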
23020 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23021 SDLoc DL(N);
23022 EVT VT = N->getValueType(0);
23023 EVT PtrTy = N->getOperand(3).getValueType();
23024
23025 EVT LoadVT = VT;
23026 if (VT.isFloatingPoint())
23027 LoadVT = VT.changeTypeToInteger();
23028
23029 auto *MINode = cast<MemIntrinsicSDNode>(N);
23030 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23031 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23032 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23033 MINode->getOperand(2), PassThru,
23034 MINode->getMemoryVT(), MINode->getMemOperand(),
23035 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23036
23037 if (VT.isFloatingPoint()) {
23038 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23039 return DAG.getMergeValues(Ops, DL);
23040 }
23041
23042 return L;
23043}
23044
23045template <unsigned Opcode>
23046 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
23047 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23048 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23049 "Unsupported opcode.");
23050 SDLoc DL(N);
23051 EVT VT = N->getValueType(0);
23052
23053 EVT LoadVT = VT;
23054 if (VT.isFloatingPoint())
23055 LoadVT = VT.changeTypeToInteger();
23056
23057 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23058 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23059 SDValue LoadChain = SDValue(Load.getNode(), 1);
23060
23061 if (VT.isFloatingPoint())
23062 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23063
23064 return DAG.getMergeValues({Load, LoadChain}, DL);
23065}
23066
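// Lower a predicated SVE store intrinsic to ST1_PRED, bitcasting
// floating-point data to its integer container type (or any-extending
// integer data) first.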
23067 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23068 SDLoc DL(N);
23069 SDValue Data = N->getOperand(2);
23070 EVT DataVT = Data.getValueType();
23071 EVT HwSrcVt = getSVEContainerType(DataVT);
23072 SDValue InputVT = DAG.getValueType(DataVT);
23073
23074 if (DataVT.isFloatingPoint())
23075 InputVT = DAG.getValueType(HwSrcVt);
23076
23077 SDValue SrcNew;
23078 if (Data.getValueType().isFloatingPoint())
23079 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23080 else
23081 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23082
23083 SDValue Ops[] = { N->getOperand(0), // Chain
23084 SrcNew,
23085 N->getOperand(4), // Base
23086 N->getOperand(3), // Pg
23087 InputVT
23088 };
23089
23090 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23091}
23092
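// Lower the matching SVE store intrinsic to an unindexed, non-truncating
// masked store, bitcasting floating-point data to integer first.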
23093 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23094 SDLoc DL(N);
23095
23096 SDValue Data = N->getOperand(2);
23097 EVT DataVT = Data.getValueType();
23098 EVT PtrTy = N->getOperand(4).getValueType();
23099
23100 if (DataVT.isFloatingPoint())
23101 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23102
23103 auto *MINode = cast<MemIntrinsicSDNode>(N);
23104 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23105 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23106 MINode->getMemoryVT(), MINode->getMemOperand(),
23107 ISD::UNINDEXED, false, false);
23108}
23109
23110/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
23111/// load store optimizer pass will merge them to store pair stores. This should
23112/// be better than a movi to create the vector zero followed by a vector store
23113 /// if the zero constant is not re-used, since one instruction and one register
23114/// live range will be removed.
23115///
23116/// For example, the final generated code should be:
23117///
23118/// stp xzr, xzr, [x0]
23119///
23120/// instead of:
23121///
23122/// movi v0.2d, #0
23123/// str q0, [x0]
23124///
23125 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23126 SDValue StVal = St.getValue();
23127 EVT VT = StVal.getValueType();
23128
23129 // Avoid scalarizing zero splat stores for scalable vectors.
23130 if (VT.isScalableVector())
23131 return SDValue();
23132
23133 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23134 // 2, 3 or 4 i32 elements.
23135 int NumVecElts = VT.getVectorNumElements();
23136 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23137 VT.getVectorElementType().getSizeInBits() == 64) ||
23138 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23139 VT.getVectorElementType().getSizeInBits() == 32)))
23140 return SDValue();
23141
23142 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23143 return SDValue();
23144
23145 // If the zero constant has more than one use then the vector store could be
23146 // better since the constant mov will be amortized and stp q instructions
23147 // should be able to be formed.
23148 if (!StVal.hasOneUse())
23149 return SDValue();
23150
23151 // If the store is truncating then it's going down to i16 or smaller, which
23152 // means it can be implemented in a single store anyway.
23153 if (St.isTruncatingStore())
23154 return SDValue();
23155
23156 // If the immediate offset of the address operand is too large for the stp
23157 // instruction, then bail out.
23158 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23159 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23160 if (Offset < -512 || Offset > 504)
23161 return SDValue();
23162 }
23163
23164 for (int I = 0; I < NumVecElts; ++I) {
23165 SDValue EltVal = StVal.getOperand(I);
23166 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23167 return SDValue();
23168 }
23169
23170 // Use a CopyFromReg WZR/XZR here to prevent
23171 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23172 SDLoc DL(&St);
23173 unsigned ZeroReg;
23174 EVT ZeroVT;
23175 if (VT.getVectorElementType().getSizeInBits() == 32) {
23176 ZeroReg = AArch64::WZR;
23177 ZeroVT = MVT::i32;
23178 } else {
23179 ZeroReg = AArch64::XZR;
23180 ZeroVT = MVT::i64;
23181 }
23182 SDValue SplatVal =
23183 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23184 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23185}
23186
23187/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
23188/// value. The load store optimizer pass will merge them to store pair stores.
23189/// This has better performance than a splat of the scalar followed by a split
23190/// vector store. Even if the stores are not merged it is four stores vs a dup,
23191/// followed by an ext.b and two stores.
23192 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23193 SDValue StVal = St.getValue();
23194 EVT VT = StVal.getValueType();
23195
23196 // Don't replace floating point stores, they possibly won't be transformed to
23197 // stp because of the store pair suppress pass.
23198 if (VT.isFloatingPoint())
23199 return SDValue();
23200
23201 // We can express a splat as store pair(s) for 2 or 4 elements.
23202 unsigned NumVecElts = VT.getVectorNumElements();
23203 if (NumVecElts != 4 && NumVecElts != 2)
23204 return SDValue();
23205
23206 // If the store is truncating then it's going down to i16 or smaller, which
23207 // means it can be implemented in a single store anyway.
23208 if (St.isTruncatingStore())
23209 return SDValue();
23210
23211 // Check that this is a splat.
23212 // Make sure that each of the relevant vector element locations are inserted
23213 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
23214 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23215 SDValue SplatVal;
23216 for (unsigned I = 0; I < NumVecElts; ++I) {
23217 // Check for insert vector elements.
23218 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23219 return SDValue();
23220
23221 // Check that same value is inserted at each vector element.
23222 if (I == 0)
23223 SplatVal = StVal.getOperand(1);
23224 else if (StVal.getOperand(1) != SplatVal)
23225 return SDValue();
23226
23227 // Check insert element index.
23228 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
23229 if (!CIndex)
23230 return SDValue();
23231 uint64_t IndexVal = CIndex->getZExtValue();
23232 if (IndexVal >= NumVecElts)
23233 return SDValue();
23234 IndexNotInserted.reset(IndexVal);
23235
23236 StVal = StVal.getOperand(0);
23237 }
23238 // Check that all vector element locations were inserted to.
23239 if (IndexNotInserted.any())
23240 return SDValue();
23241
23242 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23243}
23244
23245 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23246 SelectionDAG &DAG,
23247 const AArch64Subtarget *Subtarget) {
23248
23249 StoreSDNode *S = cast<StoreSDNode>(N);
23250 if (S->isVolatile() || S->isIndexed())
23251 return SDValue();
23252
23253 SDValue StVal = S->getValue();
23254 EVT VT = StVal.getValueType();
23255
23256 if (!VT.isFixedLengthVector())
23257 return SDValue();
23258
23259 // If we get a splat of zeros, convert this vector store to a store of
23260 // scalars. They will be merged into store pairs of xzr thereby removing one
23261 // instruction and one register.
23262 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23263 return ReplacedZeroSplat;
23264
23265 // FIXME: The logic for deciding if an unaligned store should be split should
23266 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23267 // a call to that function here.
23268
23269 if (!Subtarget->isMisaligned128StoreSlow())
23270 return SDValue();
23271
23272 // Don't split at -Oz.
23273 if (DAG.getMachineFunction().getFunction().hasMinSize())
23274 return SDValue();
23275
23276 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23277 // those up regresses performance on micro-benchmarks and olden/bh.
23278 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23279 return SDValue();
23280
23281 // Split unaligned 16B stores. They are terrible for performance.
23282 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23283 // extensions can use this to mark that it does not want splitting to happen
23284 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23285 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
23286 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23287 S->getAlign() <= Align(2))
23288 return SDValue();
23289
23290 // If we get a splat of a scalar convert this vector store to a store of
23291 // scalars. They will be merged into store pairs thereby removing two
23292 // instructions.
23293 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23294 return ReplacedSplat;
23295
23296 SDLoc DL(S);
23297
23298 // Split VT into two.
23299 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23300 unsigned NumElts = HalfVT.getVectorNumElements();
23301 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23302 DAG.getConstant(0, DL, MVT::i64));
23303 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23304 DAG.getConstant(NumElts, DL, MVT::i64));
23305 SDValue BasePtr = S->getBasePtr();
23306 SDValue NewST1 =
23307 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23308 S->getAlign(), S->getMemOperand()->getFlags());
23309 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23310 DAG.getConstant(8, DL, MVT::i64));
23311 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23312 S->getPointerInfo(), S->getAlign(),
23313 S->getMemOperand()->getFlags());
23314}
23315
23316 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
23317 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23318
23319 // splice(pg, op1, undef) -> op1
23320 if (N->getOperand(2).isUndef())
23321 return N->getOperand(1);
23322
23323 return SDValue();
23324}
23325
23326 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
23327 const AArch64Subtarget *Subtarget) {
23328 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23329 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23330 "Unexpected Opcode!");
23331
23332 // uunpklo/hi undef -> undef
23333 if (N->getOperand(0).isUndef())
23334 return DAG.getUNDEF(N->getValueType(0));
23335
23336 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23337 // extending load. We can do this even if this is already a masked
23338 // {z,}extload.
23339 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23340 N->getOpcode() == AArch64ISD::UUNPKLO) {
23341 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23342 SDValue Mask = MLD->getMask();
23343 SDLoc DL(N);
23344
23345 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23346 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23347 (MLD->getPassThru()->isUndef() ||
23348 isZerosVector(MLD->getPassThru().getNode()))) {
23349 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23350 unsigned PgPattern = Mask->getConstantOperandVal(0);
23351 EVT VT = N->getValueType(0);
23352
23353 // Ensure we can double the size of the predicate pattern
23354 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23355 if (NumElts &&
23356 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23357 Mask =
23358 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23359 SDValue PassThru = DAG.getConstant(0, DL, VT);
23360 SDValue NewLoad = DAG.getMaskedLoad(
23361 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23362 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23363 MLD->getAddressingMode(), ISD::ZEXTLOAD);
23364
23365 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23366
23367 return NewLoad;
23368 }
23369 }
23370 }
23371
23372 return SDValue();
23373}
23374
23376 if (N->getOpcode() != AArch64ISD::UZP1)
23377 return false;
23378 SDValue Op0 = N->getOperand(0);
23379 EVT SrcVT = Op0->getValueType(0);
23380 EVT DstVT = N->getValueType(0);
23381 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23382 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23383 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23384}
23385
23386// Try to combine rounding shifts where the operands come from an extend, and
23387// the result is truncated and combined into one vector.
23388// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23390 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23391 SDValue Op0 = N->getOperand(0);
23392 SDValue Op1 = N->getOperand(1);
23393 EVT ResVT = N->getValueType(0);
23394
23395 unsigned RshOpc = Op0.getOpcode();
23396 if (RshOpc != AArch64ISD::RSHRNB_I)
23397 return SDValue();
23398
23399 // Same op code and imm value?
23400 SDValue ShiftValue = Op0.getOperand(1);
23401 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23402 return SDValue();
23403
23404 // Same unextended operand value?
23405 SDValue Lo = Op0.getOperand(0);
23406 SDValue Hi = Op1.getOperand(0);
23407 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23408 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23409 return SDValue();
23410 SDValue OrigArg = Lo.getOperand(0);
23411 if (OrigArg != Hi.getOperand(0))
23412 return SDValue();
23413
23414 SDLoc DL(N);
23415 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23416 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23417 ShiftValue);
23418}
23419
23420// Try to simplify:
23421// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23422// t2 = nxv8i16 srl(t1, ShiftValue)
23423// to
23424// t1 = nxv8i16 rshrnb(X, shiftvalue).
23425// rshrnb will zero the top half bits of each element. Therefore, this combine
23426// should only be performed when a following instruction with the rshrnb
23427// as an operand does not care about the top half of each element. For example,
23428// a uzp1 or a truncating store.
23429 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23430 const AArch64Subtarget *Subtarget) {
23431 EVT VT = Srl->getValueType(0);
23432 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23433 return SDValue();
23434
23435 EVT ResVT;
23436 if (VT == MVT::nxv8i16)
23437 ResVT = MVT::nxv16i8;
23438 else if (VT == MVT::nxv4i32)
23439 ResVT = MVT::nxv8i16;
23440 else if (VT == MVT::nxv2i64)
23441 ResVT = MVT::nxv4i32;
23442 else
23443 return SDValue();
23444
23445 SDLoc DL(Srl);
23446 unsigned ShiftValue;
23447 SDValue RShOperand;
23448 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23449 return SDValue();
23450 SDValue Rshrnb = DAG.getNode(
23451 AArch64ISD::RSHRNB_I, DL, ResVT,
23452 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23453 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23454}
23455
23456 static SDValue isNVCastToHalfWidthElements(SDValue V) {
23457 if (V.getOpcode() != AArch64ISD::NVCAST)
23458 return SDValue();
23459
23460 SDValue Op = V.getOperand(0);
23461 if (!Op.getValueType().isVector() ||
23462 V.getValueType().getVectorElementCount() !=
23463 Op.getValueType().getVectorElementCount() * 2)
23464 return SDValue();
23465
23466 return Op;
23467}
23468
23469 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23470 const AArch64Subtarget *Subtarget) {
23471 SDLoc DL(N);
23472 SDValue Op0 = N->getOperand(0);
23473 SDValue Op1 = N->getOperand(1);
23474 EVT ResVT = N->getValueType(0);
23475
23476 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23477 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23478 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23479 Op0.getOperand(0) == Op1.getOperand(0)) {
23480
23481 SDValue SourceVec = Op0.getOperand(0);
23482 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23483 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23484 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23485 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23486 EVT OpVT = Op0.getOperand(1).getValueType();
23487 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23488 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23489 DAG.getUNDEF(WidenedResVT));
23490 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23491 DAG.getConstant(0, DL, OpVT));
23492 }
23493 }
23494
23495 // Following optimizations only work with uzp1.
23496 if (N->getOpcode() == AArch64ISD::UZP2)
23497 return SDValue();
23498
23499 // uzp1(x, undef) -> concat(truncate(x), undef)
23500 if (Op1.getOpcode() == ISD::UNDEF) {
23501 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23502 switch (ResVT.getSimpleVT().SimpleTy) {
23503 default:
23504 break;
23505 case MVT::v16i8:
23506 BCVT = MVT::v8i16;
23507 HalfVT = MVT::v8i8;
23508 break;
23509 case MVT::v8i16:
23510 BCVT = MVT::v4i32;
23511 HalfVT = MVT::v4i16;
23512 break;
23513 case MVT::v4i32:
23514 BCVT = MVT::v2i64;
23515 HalfVT = MVT::v2i32;
23516 break;
23517 }
23518 if (BCVT != MVT::Other) {
23519 SDValue BC = DAG.getBitcast(BCVT, Op0);
23520 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23521 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23522 DAG.getUNDEF(HalfVT));
23523 }
23524 }
23525
23526 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23527 return Urshr;
23528
23529 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23530 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23531 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23532 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23533 }
23534 }
23535
23536 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23537 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23538 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23539 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23540 }
23541 }
23542
23543 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23544 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23545 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23546 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23547 SDValue X = PreCast.getOperand(0).getOperand(0);
23548 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23549 }
23550 }
23551 }
23552
23553 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23554 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23555 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23556 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23557 SDValue Z = PreCast.getOperand(0).getOperand(1);
23558 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23559 }
23560 }
23561 }
23562
23563 // These optimizations only work on little endian.
23564 if (!DAG.getDataLayout().isLittleEndian())
23565 return SDValue();
23566
23567 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23568 // Example:
23569 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23570 // to
23571 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23573 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23574 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23575 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23576 Op1.getOperand(0));
23577 }
23578 }
23579
23580 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23581 return SDValue();
23582
23583 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23584 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23585
23586 // truncating uzp1(x, y) -> xtn(concat (x, y))
23587 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23588 EVT Op0Ty = SourceOp0.getValueType();
23589 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23590 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23591 SDValue Concat =
23592 DAG.getNode(ISD::CONCAT_VECTORS, DL,
23593 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23594 SourceOp0, SourceOp1);
23595 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23596 }
23597 }
23598
23599 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23600 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23601 SourceOp1.getOpcode() != ISD::TRUNCATE)
23602 return SDValue();
23603 SourceOp0 = SourceOp0.getOperand(0);
23604 SourceOp1 = SourceOp1.getOperand(0);
23605
23606 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23607 !SourceOp0.getValueType().isSimple())
23608 return SDValue();
23609
23610 EVT ResultTy;
23611
23612 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23613 case MVT::v2i64:
23614 ResultTy = MVT::v4i32;
23615 break;
23616 case MVT::v4i32:
23617 ResultTy = MVT::v8i16;
23618 break;
23619 case MVT::v8i16:
23620 ResultTy = MVT::v16i8;
23621 break;
23622 default:
23623 return SDValue();
23624 }
23625
23626 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23627 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23628 SDValue UzpResult =
23629 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23630
23631 EVT BitcastResultTy;
23632
23633 switch (ResVT.getSimpleVT().SimpleTy) {
23634 case MVT::v2i32:
23635 BitcastResultTy = MVT::v2i64;
23636 break;
23637 case MVT::v4i16:
23638 BitcastResultTy = MVT::v4i32;
23639 break;
23640 case MVT::v8i8:
23641 BitcastResultTy = MVT::v8i16;
23642 break;
23643 default:
23644 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23645 }
23646
23647 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23648 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23649}
23650
23651 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23652 unsigned Opc = N->getOpcode();
23653
23654 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23655 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23656 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23657 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23658 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23659 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23660 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23661 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23662
23663 SDLoc DL(N);
23664 SDValue Chain = N->getOperand(0);
23665 SDValue Pg = N->getOperand(1);
23666 SDValue Base = N->getOperand(2);
23667 SDValue Offset = N->getOperand(3);
23668 SDValue Ty = N->getOperand(4);
23669
23670 EVT ResVT = N->getValueType(0);
23671
23672 const auto OffsetOpc = Offset.getOpcode();
23673 const bool OffsetIsZExt =
23674 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23675 const bool OffsetIsSExt =
23676 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23677
23678 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23679 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23680 SDValue ExtPg = Offset.getOperand(0);
23681 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23682 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23683
23684 // If the predicate for the sign- or zero-extended offset is the
23685 // same as the predicate used for this load and the sign-/zero-extension
23686 // was from a 32-bits...
23687 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23688 SDValue UnextendedOffset = Offset.getOperand(1);
23689
23690 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23691 if (Signed)
23692 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23693
23694 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23695 {Chain, Pg, Base, UnextendedOffset, Ty});
23696 }
23697 }
23698
23699 return SDValue();
23700}
23701
23702/// Optimize a vector shift instruction and its operand if shifted out
23703/// bits are not used.
23704 static SDValue performVectorShiftCombine(SDNode *N,
23705 const AArch64TargetLowering &TLI,
23706 TargetLowering::DAGCombinerInfo &DCI) {
23707 assert(N->getOpcode() == AArch64ISD::VASHR ||
23708 N->getOpcode() == AArch64ISD::VLSHR);
23709
23710 SDValue Op = N->getOperand(0);
23711 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23712
23713 unsigned ShiftImm = N->getConstantOperandVal(1);
23714 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23715
23716 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
23717 if (N->getOpcode() == AArch64ISD::VASHR &&
23718 Op.getOpcode() == AArch64ISD::VSHL &&
23719 N->getOperand(1) == Op.getOperand(1))
23720 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23721 return Op.getOperand(0);
23722
23723 // If the shift is exact, the shifted out bits matter.
23724 if (N->getFlags().hasExact())
23725 return SDValue();
23726
23727 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23728 APInt DemandedMask = ~ShiftedOutBits;
23729
23730 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23731 return SDValue(N, 0);
23732
23733 return SDValue();
23734}
23735
23736 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23737 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23738 // This transform works in partnership with performSetCCPunpkCombine to
23739 // remove unnecessary transfer of predicates into standard registers and back
23740 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23741 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23742 MVT::i1) {
23743 SDValue CC = N->getOperand(0)->getOperand(0);
23744 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23745 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23746 DAG.getVectorIdxConstant(0, SDLoc(N)));
23747 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23748 }
23749
23750 return SDValue();
23751}
23752
23753/// Target-specific DAG combine function for post-increment LD1 (lane) and
23754/// post-increment LD1R.
23755 static SDValue performPostLD1Combine(SDNode *N,
23756 TargetLowering::DAGCombinerInfo &DCI,
23757 bool IsLaneOp) {
23758 if (DCI.isBeforeLegalizeOps())
23759 return SDValue();
23760
23761 SelectionDAG &DAG = DCI.DAG;
23762 EVT VT = N->getValueType(0);
23763
23764 if (!VT.is128BitVector() && !VT.is64BitVector())
23765 return SDValue();
23766
23767 // If it is not a LOAD, we cannot do this combine.
23768 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23769 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23770 if (!LD)
23771 return SDValue();
23772
23773 // If the Generic combiner already helped form a pre- or post-indexed load,
23774 // skip forming one here.
23775 if (LD->isIndexed())
23776 return SDValue();
23777
23778 // The vector lane must be a constant in the LD1LANE opcode.
23779 SDValue Lane;
23780 if (IsLaneOp) {
23781 Lane = N->getOperand(2);
23782 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23783 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23784 return SDValue();
23785 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
23786 return SDValue();
23787 }
23788
23789 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23790 EVT MemVT = LoadSDN->getMemoryVT();
23791 // Check if memory operand is the same type as the vector element.
23792 if (MemVT != VT.getVectorElementType())
23793 return SDValue();
23794
23795 // Check if there are other uses. If so, do not combine as it will introduce
23796 // an extra load.
23797 for (SDUse &U : LD->uses()) {
23798 if (U.getResNo() == 1) // Ignore uses of the chain result.
23799 continue;
23800 if (U.getUser() != N)
23801 return SDValue();
23802 }
23803
23804 // If there is one use and it can splat the value, prefer that operation.
23805 // TODO: This could be expanded to more operations if they reliably use the
23806 // index variants.
23807 if (N->hasOneUse()) {
23808 unsigned UseOpc = N->user_begin()->getOpcode();
23809 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23810 return SDValue();
23811 }
23812
23813 SDValue Addr = LD->getOperand(1);
23814 SDValue Vector = N->getOperand(0);
23815 // Search for a use of the address operand that is an increment.
23816 for (SDUse &Use : Addr->uses()) {
23817 SDNode *User = Use.getUser();
23818 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23819 continue;
23820
23821 // If the increment is a constant, it must match the memory ref size.
23822 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23823 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23824 uint32_t IncVal = CInc->getZExtValue();
23825 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23826 if (IncVal != NumBytes)
23827 continue;
23828 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23829 }
23830
23831 // To avoid cycle construction make sure that neither the load nor the add
23832 // are predecessors to each other or the Vector.
23833 SmallPtrSet<const SDNode *, 32> Visited;
23834 SmallVector<const SDNode *, 16> Worklist;
23835 Visited.insert(Addr.getNode());
23836 Worklist.push_back(User);
23837 Worklist.push_back(LD);
23838 Worklist.push_back(Vector.getNode());
23839 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
23840 SDNode::hasPredecessorHelper(User, Visited, Worklist))
23841 continue;
23842
23843 SmallVector<SDValue, 8> Ops;
23844 Ops.push_back(LD->getOperand(0)); // Chain
23845 if (IsLaneOp) {
23846 Ops.push_back(Vector); // The vector to be inserted
23847 Ops.push_back(Lane); // The lane to be inserted in the vector
23848 }
23849 Ops.push_back(Addr);
23850 Ops.push_back(Inc);
23851
23852 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23853 SDVTList SDTys = DAG.getVTList(Tys);
23854 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23855 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
23856 MemVT,
23857 LoadSDN->getMemOperand());
23858
23859 // Update the uses.
23860 SDValue NewResults[] = {
23861 SDValue(LD, 0), // The result of load
23862 SDValue(UpdN.getNode(), 2) // Chain
23863 };
23864 DCI.CombineTo(LD, NewResults);
23865 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
23866 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
23867
23868 break;
23869 }
23870 return SDValue();
23871}
23872
23873/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23874/// address translation.
23875 static bool performTBISimplification(SDValue Addr,
23876 TargetLowering::DAGCombinerInfo &DCI,
23877 SelectionDAG &DAG) {
23878 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
23879 KnownBits Known;
23880 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23881 !DCI.isBeforeLegalizeOps());
23882 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23883 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
23884 DCI.CommitTargetLoweringOpt(TLO);
23885 return true;
23886 }
23887 return false;
23888}
23889
23890 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23891 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23892 "Expected STORE dag node in input!");
23893
23894 if (auto Store = dyn_cast<StoreSDNode>(N)) {
23895 if (!Store->isTruncatingStore() || Store->isIndexed())
23896 return SDValue();
23897 SDValue Ext = Store->getValue();
23898 auto ExtOpCode = Ext.getOpcode();
23899 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23900 ExtOpCode != ISD::ANY_EXTEND)
23901 return SDValue();
23902 SDValue Orig = Ext->getOperand(0);
23903 if (Store->getMemoryVT() != Orig.getValueType())
23904 return SDValue();
23905 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
23906 Store->getBasePtr(), Store->getMemOperand());
23907 }
23908
23909 return SDValue();
23910}
23911
23912// A custom combine to lower load <3 x i8> as the more efficient sequence
23913// below:
23914// ldrb wX, [x0, #2]
23915// ldrh wY, [x0]
23916// orr wX, wY, wX, lsl #16
23917// fmov s0, wX
23918//
23919// Note that an alternative sequence with even fewer (although usually more
23920// complex/expensive) instructions would be:
23921// ld1r.4h { v0 }, [x0], #2
23922// ld1.b { v0 }[2], [x0]
23923//
23924// Generating this sequence unfortunately results in noticeably worse codegen
23925// for code that extends the loaded v3i8, due to legalization breaking vector
23926// shuffle detection in a way that is very difficult to work around.
23927// TODO: Revisit once v3i8 legalization has been improved in general.
23928 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23929 EVT MemVT = LD->getMemoryVT();
23930 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
23931 LD->getBaseAlign() >= 4)
23932 return SDValue();
23933
23934 SDLoc DL(LD);
23935 MachineFunction &MF = DAG.getMachineFunction();
23936 SDValue Chain = LD->getChain();
23937 SDValue BasePtr = LD->getBasePtr();
23938 MachineMemOperand *MMO = LD->getMemOperand();
23939 assert(LD->getOffset().isUndef() && "undef offset expected");
23940
23941 // Load 2 x i8, then 1 x i8.
23942 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
23943 TypeSize Offset2 = TypeSize::getFixed(2);
23944 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
23945 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
23946 MF.getMachineMemOperand(MMO, 2, 1));
23947
23948 // Extend to i32.
23949 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
23950 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
23951
23952 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23953 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
23954 DAG.getConstant(16, DL, MVT::i32));
23955 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
23956 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
23957
23958 // Extract v3i8 again.
23959 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
23960 DAG.getConstant(0, DL, MVT::i64));
23961 SDValue TokenFactor = DAG.getNode(
23962 ISD::TokenFactor, DL, MVT::Other,
23963 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
23964 return DAG.getMergeValues({Extract, TokenFactor}, DL);
23965}
23966
23967// Perform TBI simplification if supported by the target and try to break up
23968 // nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
23969// load instructions can be selected.
23970 static SDValue performLOADCombine(SDNode *N,
23971 TargetLowering::DAGCombinerInfo &DCI,
23972 SelectionDAG &DAG,
23973 const AArch64Subtarget *Subtarget) {
23974 if (Subtarget->supportsAddressTopByteIgnored())
23975 performTBISimplification(N->getOperand(1), DCI, DAG);
23976
23977 LoadSDNode *LD = cast<LoadSDNode>(N);
23978 EVT RegVT = LD->getValueType(0);
23979 EVT MemVT = LD->getMemoryVT();
23980 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23981 SDLoc DL(LD);
23982
23983 // Cast ptr32 and ptr64 pointers to the default address space before a load.
23984 unsigned AddrSpace = LD->getAddressSpace();
23985 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23986 AddrSpace == ARM64AS::PTR32_UPTR) {
23987 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23988 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
23989 SDValue Cast =
23990 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
23991 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
23992 Cast, LD->getPointerInfo(), MemVT,
23993 LD->getBaseAlign(),
23994 LD->getMemOperand()->getFlags());
23995 }
23996 }
23997
23998 if (LD->isVolatile() || !Subtarget->isLittleEndian())
23999 return SDValue(N, 0);
24000
24001 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24002 return Res;
24003
24004 if (!LD->isNonTemporal())
24005 return SDValue(N, 0);
24006
24007 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24008 MemVT.getSizeInBits() % 256 == 0 ||
24009 256 % MemVT.getScalarSizeInBits() != 0)
24010 return SDValue(N, 0);
24011
24012 SDValue Chain = LD->getChain();
24013 SDValue BasePtr = LD->getBasePtr();
24014 SDNodeFlags Flags = LD->getFlags();
24015 SmallVector<SDValue, 4> LoadOps;
24016 SmallVector<SDValue, 4> LoadOpsChain;
24017 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24018 // and a scalar/vector load of less than 256 bits. This way we can utilize 256-bit
24019 // loads and reduce the amount of load instructions generated.
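// For example, a non-temporal load of v17i32 (544 bits) becomes two v8i32
// (256-bit) loads plus a v1i32 load for the remaining 32 bits; the pieces are
// concatenated and the original v17i32 value is extracted from the result.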
24020 MVT NewVT =
24021 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
24022 256 / MemVT.getVectorElementType().getSizeInBits());
24023 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24024 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads - 1) * 32.
24025 for (unsigned I = 0; I < Num256Loads; I++) {
24026 unsigned PtrOffset = I * 32;
24027 SDValue NewPtr = DAG.getMemBasePlusOffset(
24028 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24029 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24030 SDValue NewLoad = DAG.getLoad(
24031 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24032 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24033 LoadOps.push_back(NewLoad);
24034 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24035 }
24036
24037 // Process remaining bits of the load operation.
24038 // This is done by creating an UNDEF vector to match the size of the
24039 // 256-bit loads and inserting the remaining load to it. We extract the
24040 // original load type at the end using EXTRACT_SUBVECTOR instruction.
24041 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24042 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24043 MVT RemainingVT = MVT::getVectorVT(
24044 MemVT.getVectorElementType().getSimpleVT(),
24045 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24046 SDValue NewPtr = DAG.getMemBasePlusOffset(
24047 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24048 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24049 SDValue RemainingLoad =
24050 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24051 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24052 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24053 SDValue UndefVector = DAG.getUNDEF(NewVT);
24054 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24055 SDValue ExtendedRemainingLoad =
24056 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24057 {UndefVector, RemainingLoad, InsertIdx});
24058 LoadOps.push_back(ExtendedRemainingLoad);
24059 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24060 EVT ConcatVT =
24061 EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
24062 LoadOps.size() * NewVT.getVectorNumElements());
24063 SDValue ConcatVectors =
24064 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24065 // Extract the original vector type size.
24066 SDValue ExtractSubVector =
24067 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24068 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24069 SDValue TokenFactor =
24070 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24071 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24072}
24073
24074 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24075 EVT VecVT = Op.getValueType();
24076 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24077 "Need boolean vector type.");
24078
24079 if (Depth > 3)
24080 return EVT();
24081
24082 // We can get the base type from a vector compare or truncate.
24083 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24084 return Op.getOperand(0).getValueType();
24085
24086 // If an operand is a bool vector, continue looking.
24087 EVT BaseVT(MVT::INVALID_SIMPLE_VALUE_TYPE);
24088 for (SDValue Operand : Op->op_values()) {
24089 if (Operand.getValueType() != VecVT)
24090 continue;
24091
24092 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24093 if (!BaseVT.isSimple())
24094 BaseVT = OperandVT;
24095 else if (OperandVT != BaseVT)
24096 return EVT();
24097 }
24098
24099 return BaseVT;
24100}
24101
24102// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24103// iN, we can use a trick that extracts the i^th bit from the i^th element and
24104// then performs a vector add to get a scalar bitmask. This requires that each
24105// element's bits are either all 1 or all 0.
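// For example, a v4i32 comparison result is sign-extended so each lane is all
// ones or all zeros, ANDed with the mask <1, 2, 4, 8>, and a VECREDUCE_ADD of
// that then yields a scalar whose low 4 bits are the per-lane bitmask.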
24106 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24107 SDLoc DL(N);
24108 SDValue ComparisonResult(N, 0);
24109 EVT VecVT = ComparisonResult.getValueType();
24110 assert(VecVT.isVector() && "Must be a vector type");
24111
24112 unsigned NumElts = VecVT.getVectorNumElements();
24113 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24114 return SDValue();
24115
24116 if (VecVT.getVectorElementType() != MVT::i1 &&
24117 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24118 return SDValue();
24119
24120 // If we can find the original types to work on instead of a vector of i1,
24121 // we can avoid extend/extract conversion instructions.
24122 if (VecVT.getVectorElementType() == MVT::i1) {
24123 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24124 if (!VecVT.isSimple()) {
24125 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24126 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24127 }
24128 }
24129 VecVT = VecVT.changeVectorElementTypeToInteger();
24130
24131 // Large vectors don't map directly to this conversion, so to avoid too many
24132 // edge cases, we don't apply it here. The conversion will likely still be
24133 // applied later via multiple smaller vectors, whose results are concatenated.
24134 if (VecVT.getSizeInBits() > 128)
24135 return SDValue();
24136
24137 // Ensure that all elements' bits are either 0s or 1s.
24138 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24139
24140 SmallVector<SDValue, 16> MaskConstants;
24141 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
24142 VecVT == MVT::v16i8) {
24143 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24144 // per entry. We split it into two halves, apply the mask, zip the halves to
24145 // create 8x 16-bit values, and then perform the vector reduce.
24146 for (unsigned Half = 0; Half < 2; ++Half) {
24147 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
24148 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24149 }
24150 }
24151 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24152 SDValue RepresentativeBits =
24153 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24154
24155 SDValue UpperRepresentativeBits =
24156 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24157 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24158 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24159 RepresentativeBits, UpperRepresentativeBits);
24160 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24161 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24162 }
24163
24164 // All other vector sizes.
24165 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
24166 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
24167 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24168 }
24169
24170 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24171 SDValue RepresentativeBits =
24172 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24173 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24174 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24175 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24176}
24177
24178 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24179 StoreSDNode *Store) {
24180 if (!Store->isTruncatingStore())
24181 return SDValue();
24182
24183 SDLoc DL(Store);
24184 SDValue VecOp = Store->getValue();
24185 EVT VT = VecOp.getValueType();
24186 EVT MemVT = Store->getMemoryVT();
24187
24188 if (!MemVT.isVector() || !VT.isVector() ||
24189 MemVT.getVectorElementType() != MVT::i1)
24190 return SDValue();
24191
24192 // If we are storing a vector that we are currently building, let
24193 // `scalarizeVectorStore()` handle this more efficiently.
24194 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24195 return SDValue();
24196
24197 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24198 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24199 if (!VectorBits)
24200 return SDValue();
24201
24202 EVT StoreVT =
24203 EVT::getIntegerVT(*DAG.getContext(), MemVT.getVectorNumElements());
24204 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24205 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24206 Store->getMemOperand());
24207}
24208
24209// Combine store (fp_to_int X) to use vector semantics around the conversion
24210// when NEON is available. This allows us to store the in-vector result directly
24211// without transferring the result into a GPR in the process.
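// For example, for an f32 source this allows roughly:
//   fcvtzs s0, s0
//   str s0, [x0]
// rather than:
//   fcvtzs w8, s0
//   str w8, [x0]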
24212 static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24213 TargetLowering::DAGCombinerInfo &DCI,
24214 SelectionDAG &DAG,
24215 const AArch64Subtarget *Subtarget) {
24216 // Limit to post-legalization in order to avoid peeling truncating stores.
24217 if (DCI.isBeforeLegalize())
24218 return SDValue();
24219 if (!Subtarget->isNeonAvailable())
24220 return SDValue();
24221 // Source operand is already a vector.
24222 SDValue Value = ST->getValue();
24223 if (Value.getValueType().isVector())
24224 return SDValue();
24225
24226 // Look through potential assertions.
24227 while (Value->isAssert())
24228 Value = Value.getOperand(0);
24229
24230 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24231 Value.getOpcode() != ISD::FP_TO_UINT)
24232 return SDValue();
24233 if (!Value->hasOneUse())
24234 return SDValue();
24235
24236 SDValue FPSrc = Value.getOperand(0);
24237 EVT SrcVT = FPSrc.getValueType();
24238 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24239 return SDValue();
24240
24241 // No support for mismatched assignments such as i64 = fp_to_sint f32
24242 EVT VT = Value.getSimpleValueType();
24243 if (VT != SrcVT.changeTypeToInteger())
24244 return SDValue();
24245
24246 // Create a 128-bit element vector to avoid widening. The floating point
24247 // conversion is transformed into a single element conversion via a pattern.
24248 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24249 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24250 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24251 SDLoc DL(ST);
24252 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24253 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24254
24255 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24256 SDValue Extracted =
24257 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24258
24259 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24260 return SDValue(ST, 0);
24261}
24262
24263 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24264 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24265 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24266 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24267}
24268
24269// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
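// For example, for a v3i32 value truncated to v3i8, the source is widened to
// v4i32, bitcast to v16i8, and bytes 0, 4 and 8 of that vector are stored
// individually at offsets 0, 1 and 2.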
24270 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24271 const AArch64Subtarget *Subtarget) {
24272 SDValue Value = ST->getValue();
24273 EVT ValueVT = Value.getValueType();
24274
24275 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24276 Value.getOpcode() != ISD::TRUNCATE ||
24277 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24278 return SDValue();
24279
24280 assert(ST->getOffset().isUndef() && "undef offset expected");
24281 SDLoc DL(ST);
24282 auto WideVT = EVT::getVectorVT(
24283 *DAG.getContext(),
24284 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24285 SDValue UndefVector = DAG.getUNDEF(WideVT);
24286 SDValue WideTrunc = DAG.getNode(
24287 ISD::INSERT_SUBVECTOR, DL, WideVT,
24288 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24289 SDValue Cast = DAG.getNode(
24290 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24291 WideTrunc);
24292
24293 MachineFunction &MF = DAG.getMachineFunction();
24294 SDValue Chain = ST->getChain();
24295 MachineMemOperand *MMO = ST->getMemOperand();
24296 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24297 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24298 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24299 TypeSize Offset2 = TypeSize::getFixed(2);
24300 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24301 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24302
24303 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24304 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24305 TypeSize Offset1 = TypeSize::getFixed(1);
24306 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24307 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24308
24309 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24310 DAG.getConstant(0, DL, MVT::i64));
24311 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24312 MF.getMachineMemOperand(MMO, 0, 1));
24313 return Chain;
24314}
24315
24316static unsigned getFPSubregForVT(EVT VT) {
24317 assert(VT.isSimple() && "Expected simple VT");
24318 switch (VT.getSimpleVT().SimpleTy) {
24319 case MVT::aarch64mfp8:
24320 return AArch64::bsub;
24321 case MVT::f16:
24322 return AArch64::hsub;
24323 case MVT::f32:
24324 return AArch64::ssub;
24325 case MVT::f64:
24326 return AArch64::dsub;
24327 default:
24328 llvm_unreachable("Unexpected VT!");
24329 }
24330}
24331
24332 static SDValue performSTORECombine(SDNode *N,
24333 TargetLowering::DAGCombinerInfo &DCI,
24334 SelectionDAG &DAG,
24335 const AArch64Subtarget *Subtarget) {
24336 StoreSDNode *ST = cast<StoreSDNode>(N);
24337 SDValue Chain = ST->getChain();
24338 SDValue Value = ST->getValue();
24339 SDValue Ptr = ST->getBasePtr();
24340 EVT ValueVT = Value.getValueType();
24341 EVT MemVT = ST->getMemoryVT();
24342 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24343 SDLoc DL(ST);
24344
24345 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24346 return Res;
24347
24348 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24349 EVT EltVT = VT.getVectorElementType();
24350 return EltVT == MVT::f32 || EltVT == MVT::f64;
24351 };
24352
24353 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24354 unsigned AddrSpace = ST->getAddressSpace();
24355 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24356 AddrSpace == ARM64AS::PTR32_UPTR) {
24357 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24358 if (PtrVT != Ptr.getSimpleValueType()) {
24359 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24360 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24361 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24362 ST->getAAInfo());
24363 }
24364 }
24365
24366 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24367 return Res;
24368
24369 // If this is an FP_ROUND followed by a store, fold this into a truncating
24370 // store. We can do this even if this is already a truncstore.
24371 // We purposefully don't care about legality of the nodes here as we know
24372 // they can be split down into something legal.
24373 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24374 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24375 Subtarget->useSVEForFixedLengthVectors() &&
24376 ValueVT.isFixedLengthVector() &&
24377 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24378 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24379 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24380 ST->getMemOperand());
24381
24382 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24383 return Split;
24384
24385 if (Subtarget->supportsAddressTopByteIgnored() &&
24386 performTBISimplification(N->getOperand(2), DCI, DAG))
24387 return SDValue(N, 0);
24388
24389 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24390 return Store;
24391
24392 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24393 return Store;
24394
24395 if (ST->isTruncatingStore() &&
24396 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24397 if (SDValue Rshrnb =
24398 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24399 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24400 MemVT, ST->getMemOperand());
24401 }
24402 }
24403
24404 // This is an integer vector_extract_elt followed by a (possibly truncating)
24405 // store. We may be able to replace this with a store of an FP subregister.
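// For example, storing lane 0 of a v4i32 can use "str s0, [x0]" directly,
// rather than moving the element to a general-purpose register first.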
24406 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24407 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24408
24409 SDValue Vector = Value.getOperand(0);
24410 SDValue ExtIdx = Value.getOperand(1);
24411 EVT VectorVT = Vector.getValueType();
24412 EVT ElemVT = VectorVT.getVectorElementType();
24413
24414 if (!ValueVT.isInteger())
24415 return SDValue();
24416
24417 // Propagate zero constants (applying this fold may miss optimizations).
24418 if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
24419 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24420 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24421 return SDValue();
24422 }
24423
24424 if (ValueVT != MemVT && !ST->isTruncatingStore())
24425 return SDValue();
24426
24427 // This could generate an additional extract if the index is non-zero and
24428 // the extracted value has multiple uses.
24429 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24430 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24431 return SDValue();
24432
24433 // These can lower to st1, which is preferable if we're unlikely to fold the
24434 // addressing into the store.
24435 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24436 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24437 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24438 return SDValue();
24439
24440 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24441 // Heuristic: If there are other users of w/x integer scalars extracted
24442 // from this vector that won't fold into the store -- abandon folding.
24443 // Applying this fold may disrupt paired stores.
24444 for (const auto &Use : Vector->uses()) {
24445 if (Use.getResNo() != Vector.getResNo())
24446 continue;
24447 const SDNode *User = Use.getUser();
24448 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24449 (!User->hasOneUse() ||
24450 (*User->user_begin())->getOpcode() != ISD::STORE))
24451 return SDValue();
24452 }
24453 }
24454
24455 SDValue ExtVector = Vector;
24456 if (!ExtCst || !ExtCst->isZero()) {
24457 // Handle extracting from lanes != 0.
24458 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24459 Value.getValueType(), Vector, ExtIdx);
24460 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24461 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24462 DAG.getUNDEF(VectorVT), Ext, Zero);
24463 }
24464
24465 EVT FPMemVT = MemVT == MVT::i8
24466 ? MVT::aarch64mfp8
24467 : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
24468 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24469 FPMemVT, ExtVector);
24470
24471 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24472 ST->getMemOperand());
24473 }
24474
24475 return SDValue();
24476}
24477
24478 static SDValue performMSTORECombine(SDNode *N,
24479 TargetLowering::DAGCombinerInfo &DCI,
24480 SelectionDAG &DAG,
24481 const AArch64Subtarget *Subtarget) {
24482 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24483 SDValue Value = MST->getValue();
24484 SDValue Mask = MST->getMask();
24485 SDLoc DL(N);
24486
24487 // If this is a UZP1 followed by a masked store, fold this into a masked
24488 // truncating store. We can do this even if this is already a masked
24489 // truncstore.
24490 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24491 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24492 Value.getValueType().isInteger()) {
24493 Value = Value.getOperand(0);
24494 if (Value.getOpcode() == ISD::BITCAST) {
24495 EVT HalfVT =
24496 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24497 EVT InVT = Value.getOperand(0).getValueType();
24498
24499 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24500 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24501 unsigned PgPattern = Mask->getConstantOperandVal(0);
24502
24503 // Ensure we can double the size of the predicate pattern
24504 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24505 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24506 MinSVESize) {
24507 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24508 PgPattern);
24509 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24510 MST->getBasePtr(), MST->getOffset(), Mask,
24511 MST->getMemoryVT(), MST->getMemOperand(),
24512 MST->getAddressingMode(),
24513 /*IsTruncating=*/true);
24514 }
24515 }
24516 }
24517 }
24518
24519 if (MST->isTruncatingStore()) {
24520 EVT ValueVT = Value->getValueType(0);
24521 EVT MemVT = MST->getMemoryVT();
24522 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24523 return SDValue();
24524 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24525 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24526 MST->getOffset(), MST->getMask(),
24527 MST->getMemoryVT(), MST->getMemOperand(),
24528 MST->getAddressingMode(), true);
24529 }
24530 }
24531
24532 return SDValue();
24533}
24534
24535/// \return true if part of the index was folded into the Base.
24536static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24537 SDLoc DL, SelectionDAG &DAG) {
24538 // This function assumes a vector of i64 indices.
24539 EVT IndexVT = Index.getValueType();
24540 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24541 return false;
24542
24543 // Simplify:
24544 // BasePtr = Ptr
24545 // Index = X + splat(Offset)
24546 // ->
24547 // BasePtr = Ptr + Offset * scale.
24548 // Index = X
24549 if (Index.getOpcode() == ISD::ADD) {
24550 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24551 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24552 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24553 Index = Index.getOperand(0);
24554 return true;
24555 }
24556 }
24557
24558 // Simplify:
24559 // BasePtr = Ptr
24560 // Index = (X + splat(Offset)) << splat(Shift)
24561 // ->
24562 // BasePtr = Ptr + (Offset << Shift) * scale
24563 // Index = X << splat(shift)
24564 if (Index.getOpcode() == ISD::SHL &&
24565 Index.getOperand(0).getOpcode() == ISD::ADD) {
24566 SDValue Add = Index.getOperand(0);
24567 SDValue ShiftOp = Index.getOperand(1);
24568 SDValue OffsetOp = Add.getOperand(1);
24569 if (auto Shift = DAG.getSplatValue(ShiftOp))
24570 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
24571 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
24572 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24573 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24574 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
24575 Add.getOperand(0), ShiftOp);
24576 return true;
24577 }
24578 }
24579
24580 return false;
24581}
24582
24583// Analyse the specified address returning true if a more optimal addressing
24584// mode is available. When returning true all parameters are updated to reflect
24585// their recommended values.
24586 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24587 SDValue &BasePtr, SDValue &Index,
24588 SelectionDAG &DAG) {
24589 // Try to iteratively fold parts of the index into the base pointer to
24590 // simplify the index as much as possible.
24591 bool Changed = false;
24592 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24593 Changed = true;
24594
24595 // Only consider element types that are pointer sized as smaller types can
24596 // be easily promoted.
24597 EVT IndexVT = Index.getValueType();
24598 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24599 return Changed;
24600
24601 // Can indices be trivially shrunk?
24602 EVT DataVT = N->getOperand(1).getValueType();
24603 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
24604 // will later be re-extended to 64 bits in legalization
24605 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24606 return Changed;
24607 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24608 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24609 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24610 return true;
24611 }
24612
24613 // Match:
24614 // Index = step(const)
24615 int64_t Stride = 0;
24616 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24617 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24618 }
24619 // Match:
24620 // Index = step(const) << shift(const)
24621 else if (Index.getOpcode() == ISD::SHL &&
24622 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24623 SDValue RHS = Index.getOperand(1);
24624 if (auto *Shift =
24625 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24626 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24627 Stride = Step << Shift->getZExtValue();
24628 }
24629 }
24630
24631 // Return early because no supported pattern is found.
24632 if (Stride == 0)
24633 return Changed;
24634
24635 if (Stride < std::numeric_limits<int32_t>::min() ||
24636 Stride > std::numeric_limits<int32_t>::max())
24637 return Changed;
24638
24639 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24640 unsigned MaxVScale =
24641 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24642 int64_t LastElementOffset =
24643 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24644
24645 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24646 LastElementOffset > std::numeric_limits<int32_t>::max())
24647 return Changed;
24648
24649 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24650 // Stride does not scale explicitly by 'Scale', because it happens in
24651 // the gather/scatter addressing mode.
24652 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24653 return true;
24654}
24655
24656 static SDValue performMaskedGatherScatterCombine(
24657 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24658 if (!DCI.isBeforeLegalize())
24659 return SDValue();
24660 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24661
24662 SDLoc DL(MGS);
24663 SDValue Chain = MGS->getChain();
24664 SDValue Scale = MGS->getScale();
24665 SDValue Index = MGS->getIndex();
24666 SDValue Mask = MGS->getMask();
24667 SDValue BasePtr = MGS->getBasePtr();
24668 ISD::MemIndexType IndexType = MGS->getIndexType();
24669
24670 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24671 return SDValue();
24672
24673 // Here we catch such cases early and change MGATHER's IndexType to allow
24674 // the use of an Index that's more legalisation friendly.
24675 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24676 SDValue PassThru = MGT->getPassThru();
24677 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24678 return DAG.getMaskedGather(
24679 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24680 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24681 }
24682 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24683 SDValue Data = MSC->getValue();
24684 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24685 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24686 DL, Ops, MSC->getMemOperand(), IndexType,
24687 MSC->isTruncatingStore());
24688 }
24689 auto *HG = cast<MaskedHistogramSDNode>(MGS);
24690 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24691 Index, Scale, HG->getIntID()};
24692 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24693 DL, Ops, HG->getMemOperand(), IndexType);
24694}
24695
24696/// Target-specific DAG combine function for NEON load/store intrinsics
24697/// to merge base address updates.
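// For example, (ld2 {v0.4s, v1.4s}, [x0]) followed by (add x0, x0, #32) can be
// merged into the post-incremented form ld2 {v0.4s, v1.4s}, [x0], #32.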
24698 static SDValue performNEONPostLDSTCombine(SDNode *N,
24699 TargetLowering::DAGCombinerInfo &DCI,
24700 SelectionDAG &DAG) {
24701 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24702 return SDValue();
24703
24704 unsigned AddrOpIdx = N->getNumOperands() - 1;
24705 SDValue Addr = N->getOperand(AddrOpIdx);
24706
24707 // Search for a use of the address operand that is an increment.
24708 for (SDUse &Use : Addr->uses()) {
24709 SDNode *User = Use.getUser();
24710 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24711 continue;
24712
24713 // Check that the add is independent of the load/store. Otherwise, folding
24714 // it would create a cycle.
24715 SmallPtrSet<const SDNode *, 32> Visited;
24716 SmallVector<const SDNode *, 16> Worklist;
24717 Visited.insert(Addr.getNode());
24718 Worklist.push_back(N);
24719 Worklist.push_back(User);
24720 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24721 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24722 continue;
24723
24724 // Find the new opcode for the updating load/store.
24725 bool IsStore = false;
24726 bool IsLaneOp = false;
24727 bool IsDupOp = false;
24728 unsigned NewOpc = 0;
24729 unsigned NumVecs = 0;
24730 unsigned IntNo = N->getConstantOperandVal(1);
24731 switch (IntNo) {
24732 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24733 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24734 NumVecs = 2; break;
24735 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24736 NumVecs = 3; break;
24737 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24738 NumVecs = 4; break;
24739 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24740 NumVecs = 2; IsStore = true; break;
24741 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24742 NumVecs = 3; IsStore = true; break;
24743 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24744 NumVecs = 4; IsStore = true; break;
24745 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24746 NumVecs = 2; break;
24747 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24748 NumVecs = 3; break;
24749 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24750 NumVecs = 4; break;
24751 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24752 NumVecs = 2; IsStore = true; break;
24753 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24754 NumVecs = 3; IsStore = true; break;
24755 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24756 NumVecs = 4; IsStore = true; break;
24757 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24758 NumVecs = 2; IsDupOp = true; break;
24759 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24760 NumVecs = 3; IsDupOp = true; break;
24761 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24762 NumVecs = 4; IsDupOp = true; break;
24763 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24764 NumVecs = 2; IsLaneOp = true; break;
24765 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24766 NumVecs = 3; IsLaneOp = true; break;
24767 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24768 NumVecs = 4; IsLaneOp = true; break;
24769 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24770 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24771 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24772 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24773 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24774 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24775 }
24776
24777 EVT VecTy;
24778 if (IsStore)
24779 VecTy = N->getOperand(2).getValueType();
24780 else
24781 VecTy = N->getValueType(0);
24782
24783 // If the increment is a constant, it must match the memory ref size.
24784 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24785 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24786 uint32_t IncVal = CInc->getZExtValue();
24787 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24788 if (IsLaneOp || IsDupOp)
24789 NumBytes /= VecTy.getVectorNumElements();
24790 if (IncVal != NumBytes)
24791 continue;
24792 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24793 }
24794 SmallVector<SDValue, 8> Ops;
24795 Ops.push_back(N->getOperand(0)); // Incoming chain
24796 // Load lane and store have vector list as input.
24797 if (IsLaneOp || IsStore)
24798 for (unsigned i = 2; i < AddrOpIdx; ++i)
24799 Ops.push_back(N->getOperand(i));
24800 Ops.push_back(Addr); // Base register
24801 Ops.push_back(Inc);
24802
24803 // Return Types.
24804 EVT Tys[6];
24805 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24806 unsigned n;
24807 for (n = 0; n < NumResultVecs; ++n)
24808 Tys[n] = VecTy;
24809 Tys[n++] = MVT::i64; // Type of write back register
24810 Tys[n] = MVT::Other; // Type of the chain
24811 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
24812
24813 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
24814 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
24815 MemInt->getMemoryVT(),
24816 MemInt->getMemOperand());
24817
24818 // Update the uses.
24819 std::vector<SDValue> NewResults;
24820 for (unsigned i = 0; i < NumResultVecs; ++i) {
24821 NewResults.push_back(SDValue(UpdN.getNode(), i));
24822 }
24823 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
24824 DCI.CombineTo(N, NewResults);
24825 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
24826
24827 break;
24828 }
24829 return SDValue();
24830}
24831
24832// Checks to see if the value is the prescribed width and returns information
24833// about its extension mode.
24834static
24835bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24836 ExtType = ISD::NON_EXTLOAD;
24837 switch(V.getNode()->getOpcode()) {
24838 default:
24839 return false;
24840 case ISD::LOAD: {
24841 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
24842 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24843 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24844 ExtType = LoadNode->getExtensionType();
24845 return true;
24846 }
24847 return false;
24848 }
24849 case ISD::AssertSext: {
24850 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24851 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24852 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24853 ExtType = ISD::SEXTLOAD;
24854 return true;
24855 }
24856 return false;
24857 }
24858 case ISD::AssertZext: {
24859 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24860 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24861 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24862 ExtType = ISD::ZEXTLOAD;
24863 return true;
24864 }
24865 return false;
24866 }
24867 case ISD::Constant:
24868 case ISD::TargetConstant: {
24869 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
24870 1LL << (width - 1);
24871 }
24872 }
24873
24874 return true;
24875}
24876
24877// This function does a whole lot of voodoo to determine if the tests are
24878// equivalent without and with a mask. Essentially what happens is that given a
24879// DAG resembling:
24880//
24881// +-------------+ +-------------+ +-------------+ +-------------+
24882// | Input | | AddConstant | | CompConstant| | CC |
24883// +-------------+ +-------------+ +-------------+ +-------------+
24884// | | | |
24885// V V | +----------+
24886// +-------------+ +----+ | |
24887// | ADD | |0xff| | |
24888// +-------------+ +----+ | |
24889// | | | |
24890// V V | |
24891// +-------------+ | |
24892// | AND | | |
24893// +-------------+ | |
24894// | | |
24895// +-----+ | |
24896// | | |
24897// V V V
24898// +-------------+
24899// | CMP |
24900// +-------------+
24901//
24902// The AND node may be safely removed for some combinations of inputs. In
24903// particular we need to take into account the extension type of the Input,
24904// the exact values of AddConstant, CompConstant, and CC, along with the nominal
24905// width of the input (this can work for any width inputs, the above graph is
24906 // specific to 8 bits).
24907//
24908// The specific equations were worked out by generating output tables for each
24909 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
24910// problem was simplified by working with 4 bit inputs, which means we only
24911// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24912// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
24913// patterns present in both extensions (0,7). For every distinct set of
24914// AddConstant and CompConstants bit patterns we can consider the masked and
24915// unmasked versions to be equivalent if the result of this function is true for
24916 // all 16 distinct bit patterns for the current extension type of Input (w0).
24917//
24918// sub w8, w0, w1
24919// and w10, w8, #0x0f
24920// cmp w8, w2
24921// cset w9, AArch64CC
24922// cmp w10, w2
24923// cset w11, AArch64CC
24924// cmp w9, w11
24925// cset w0, eq
24926// ret
24927//
24928// Since the above function shows when the outputs are equivalent it defines
24929// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24930// would be expensive to run during compiles. The equations below were written
24931 // in a test harness that confirmed they gave equivalent outputs to the above
24932 // function for all inputs, so they can be used to determine if the removal is
24933// legal instead.
24934//
24935 // isEquivalentMaskless() is the code for testing if the AND can be removed,
24936 // factored out of the DAG recognition, as the DAG can take several forms.
24937
24938static bool isEquivalentMaskless(unsigned CC, unsigned width,
24939 ISD::LoadExtType ExtType, int AddConstant,
24940 int CompConstant) {
24941 // By being careful about our equations and only writing the in term
24942 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
24943 // make them generally applicable to all bit widths.
24944 int MaxUInt = (1 << width);
24945
24946 // For the purposes of these comparisons sign extending the type is
24947 // equivalent to zero extending the add and displacing it by half the integer
24948 // width. Provided we are careful and make sure our equations are valid over
24949 // the whole range we can just adjust the input and avoid writing equations
24950 // for sign extended inputs.
24951 if (ExtType == ISD::SEXTLOAD)
24952 AddConstant -= (1 << (width-1));
24953
24954 switch(CC) {
24955 case AArch64CC::LE:
24956 case AArch64CC::GT:
24957 if ((AddConstant == 0) ||
24958 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24959 (AddConstant >= 0 && CompConstant < 0) ||
24960 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24961 return true;
24962 break;
24963 case AArch64CC::LT:
24964 case AArch64CC::GE:
24965 if ((AddConstant == 0) ||
24966 (AddConstant >= 0 && CompConstant <= 0) ||
24967 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24968 return true;
24969 break;
24970 case AArch64CC::HI:
24971 case AArch64CC::LS:
24972 if ((AddConstant >= 0 && CompConstant < 0) ||
24973 (AddConstant <= 0 && CompConstant >= -1 &&
24974 CompConstant < AddConstant + MaxUInt))
24975 return true;
24976 break;
24977 case AArch64CC::PL:
24978 case AArch64CC::MI:
24979 if ((AddConstant == 0) ||
24980 (AddConstant > 0 && CompConstant <= 0) ||
24981 (AddConstant < 0 && CompConstant <= AddConstant))
24982 return true;
24983 break;
24984 case AArch64CC::LO:
24985 case AArch64CC::HS:
24986 if ((AddConstant >= 0 && CompConstant <= 0) ||
24987 (AddConstant <= 0 && CompConstant >= 0 &&
24988 CompConstant <= AddConstant + MaxUInt))
24989 return true;
24990 break;
24991 case AArch64CC::EQ:
24992 case AArch64CC::NE:
24993 if ((AddConstant > 0 && CompConstant < 0) ||
24994 (AddConstant < 0 && CompConstant >= 0 &&
24995 CompConstant < AddConstant + MaxUInt) ||
24996 (AddConstant >= 0 && CompConstant >= 0 &&
24997 CompConstant >= AddConstant) ||
24998 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24999 return true;
25000 break;
25001 case AArch64CC::VS:
25002 case AArch64CC::VC:
25003 case AArch64CC::AL:
25004 case AArch64CC::NV:
25005 return true;
25006 case AArch64CC::Invalid:
25007 break;
25008 }
25009
25010 return false;
25011}
25012
25013// (X & C) >u Mask --> (X & (C & (~Mask)) != 0
25014// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1)) == 0
25015 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25016 SDNode *AndNode, SelectionDAG &DAG,
25017 unsigned CCIndex, unsigned CmpIndex,
25018 unsigned CC) {
25019 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25020 if (!SubsC)
25021 return SDValue();
25022
25023 APInt SubsAP = SubsC->getAPIntValue();
25024 if (CC == AArch64CC::HI) {
25025 if (!SubsAP.isMask())
25026 return SDValue();
25027 } else if (CC == AArch64CC::LO) {
25028 if (!SubsAP.isPowerOf2())
25029 return SDValue();
25030 } else
25031 return SDValue();
25032
25033 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25034 if (!AndC)
25035 return SDValue();
25036
25037 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25038
25039 SDLoc DL(N);
25040 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25041 SDValue ANDS = DAG.getNode(
25042 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25043 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25044 SDValue AArch64_CC =
25045 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
25046 N->getOperand(CCIndex)->getValueType(0));
25047
25048 // For now, only performCSELCombine and performBRCONDCombine call this
25049 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
25050 // operands. So just initialize the ops directly to simplify the code. If we have some
25051 // other case with different CCIndex, CmpIndex, we need to use for loop to
25052 // rewrite the code here.
25053 // TODO: Do we need to assert number of operand is 4 here?
25054 assert((CCIndex == 2 && CmpIndex == 3) &&
25055 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25056 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25057 ANDS.getValue(1)};
25058 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25059}
25060
25061static
25062 SDValue performCONDCombine(SDNode *N,
25063 TargetLowering::DAGCombinerInfo &DCI,
25064 SelectionDAG &DAG, unsigned CCIndex,
25065 unsigned CmpIndex) {
25066 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25067 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25068 unsigned CondOpcode = SubsNode->getOpcode();
25069
25070 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25071 !SubsNode->hasOneUse())
25072 return SDValue();
25073
25074 // There is a SUBS feeding this condition. Is it fed by a mask we can
25075 // use?
25076
25077 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25078 unsigned MaskBits = 0;
25079
25080 if (AndNode->getOpcode() != ISD::AND)
25081 return SDValue();
25082
25083 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25084 CmpIndex, CC))
25085 return Val;
25086
25087 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25088 uint32_t CNV = CN->getZExtValue();
25089 if (CNV == 255)
25090 MaskBits = 8;
25091 else if (CNV == 65535)
25092 MaskBits = 16;
25093 }
25094
25095 if (!MaskBits)
25096 return SDValue();
25097
25098 SDValue AddValue = AndNode->getOperand(0);
25099
25100 if (AddValue.getOpcode() != ISD::ADD)
25101 return SDValue();
25102
25103 // The basic dag structure is correct, grab the inputs and validate them.
25104
25105 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25106 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25107 SDValue SubsInputValue = SubsNode->getOperand(1);
25108
25109 // The mask is present and the provenance of all the values is a smaller type,
25110 // let's see if the mask is superfluous.
25111
25112 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25113 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25114 return SDValue();
25115
25116 ISD::LoadExtType ExtType;
25117
25118 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25119 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25120 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25121 return SDValue();
25122
25123 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25124 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25125 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25126 return SDValue();
25127
25128 // The AND is not necessary, remove it.
25129
25130 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25131 SubsNode->getValueType(1));
25132 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25133
25134 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25135 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25136
25137 return SDValue(N, 0);
25138}
25139
25140// Optimize compare with zero and branch.
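// For example, a conditional branch on (SUBS w0, #0) with condition EQ becomes
// "cbz w0, <dest>", and with condition NE becomes "cbnz w0, <dest>".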
25141 static SDValue performBRCONDCombine(SDNode *N,
25142 TargetLowering::DAGCombinerInfo &DCI,
25143 SelectionDAG &DAG) {
25144 MachineFunction &MF = DAG.getMachineFunction();
25145 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25146 // will not be produced, as they are conditional branch instructions that do
25147 // not set flags.
25148 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25149 return SDValue();
25150
25151 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25152 N = NV.getNode();
25153 SDValue Chain = N->getOperand(0);
25154 SDValue Dest = N->getOperand(1);
25155 SDValue CCVal = N->getOperand(2);
25156 SDValue Cmp = N->getOperand(3);
25157
25158 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25159 unsigned CC = CCVal->getAsZExtVal();
25160 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25161 return SDValue();
25162
25163 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25164 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25165 SDValue CSel = Cmp.getOperand(0);
25166 auto CSelCC = getCSETCondCode(CSel);
25167 if (CSelCC) {
25168 SDLoc DL(N);
25169 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25170 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25171 CSel.getOperand(3));
25172 }
25173 }
25174
25175 unsigned CmpOpc = Cmp.getOpcode();
25176 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25177 return SDValue();
25178
25179 // Only attempt folding if there is only one use of the flag and no use of the
25180 // value.
25181 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25182 return SDValue();
25183
25184 SDValue LHS = Cmp.getOperand(0);
25185 SDValue RHS = Cmp.getOperand(1);
25186
25187 assert(LHS.getValueType() == RHS.getValueType() &&
25188 "Expected the value type to be the same for both operands!");
25189 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25190 return SDValue();
25191
25192 if (isNullConstant(LHS))
25193 std::swap(LHS, RHS);
25194
25195 if (!isNullConstant(RHS))
25196 return SDValue();
25197
25198 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25199 LHS.getOpcode() == ISD::SRL)
25200 return SDValue();
25201
25202 // Fold the compare into the branch instruction.
25203 SDValue BR;
25204 if (CC == AArch64CC::EQ)
25205 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25206 else
25207 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25208
25209 // Do not add new nodes to DAG combiner worklist.
25210 DCI.CombineTo(N, BR, false);
25211
25212 return SDValue();
25213}
25214
25215 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25216 unsigned CC = N->getConstantOperandVal(2);
25217 SDValue SUBS = N->getOperand(3);
25218 SDValue Zero, CTTZ;
25219
25220 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25221 Zero = N->getOperand(0);
25222 CTTZ = N->getOperand(1);
25223 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25224 Zero = N->getOperand(1);
25225 CTTZ = N->getOperand(0);
25226 } else
25227 return SDValue();
25228
25229 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25230 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25231 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25232 return SDValue();
25233
25234 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25235 "Illegal type in CTTZ folding");
25236
25237 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25238 return SDValue();
25239
25240 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25241 ? CTTZ.getOperand(0).getOperand(0)
25242 : CTTZ.getOperand(0);
25243
25244 if (X != SUBS.getOperand(0))
25245 return SDValue();
25246
25247 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25248 ? CTTZ.getOperand(0).getValueSizeInBits()
25249 : CTTZ.getValueSizeInBits();
25250 SDValue BitWidthMinusOne =
25251 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25252 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25253 BitWidthMinusOne);
25254}
25255
25256// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25257// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25258// Where x and y are constants and x != y
25259
25260// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25261// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25262// Where x and y are constants and x != y
25263 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25264 SDValue L = Op->getOperand(0);
25265 SDValue R = Op->getOperand(1);
25266 AArch64CC::CondCode OpCC =
25267 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25268
25269 SDValue OpCmp = Op->getOperand(3);
25270 if (!isCMP(OpCmp))
25271 return SDValue();
25272
25273 SDValue CmpLHS = OpCmp.getOperand(0);
25274 SDValue CmpRHS = OpCmp.getOperand(1);
25275
25276 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25277 std::swap(CmpLHS, CmpRHS);
25278 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25279 return SDValue();
25280
25281 SDValue X = CmpLHS->getOperand(0);
25282 SDValue Y = CmpLHS->getOperand(1);
25283 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25284 return SDValue();
25285 }
25286
25287 // If one of the constants is an opaque constant, the x and y SDNodes are
25288 // still different but the real value may be the same. So check the APInt
25289 // here to make sure the code is correct.
25290 ConstantSDNode *CX = cast<ConstantSDNode>(X);
25291 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
25292 if (CX->getAPIntValue() == CY->getAPIntValue())
25293 return SDValue();
25294
25295 AArch64CC::CondCode CC =
25296 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25297 SDValue Cond = CmpLHS->getOperand(3);
25298
25299 if (CmpRHS == Y)
25300 CC = AArch64CC::getInvertedCondCode(CC);
25301 else if (CmpRHS != X)
25302 return SDValue();
25303
25304 if (OpCC == AArch64CC::NE)
25305 CC = AArch64CC::getInvertedCondCode(CC);
25306 else if (OpCC != AArch64CC::EQ)
25307 return SDValue();
25308
25309 SDLoc DL(Op);
25310 EVT VT = Op->getValueType(0);
25311
25312 SDValue CCValue = getCondCode(DAG, CC);
25313 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25314}
25315
25316// Reassociate the true/false expressions of a CSEL instruction to obtain a
25317// common subexpression with the comparison instruction. For example, change
25318// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25319// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25320// subexpression.
25321 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25322 SDValue SubsNode = N->getOperand(3);
25323 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25324 return SDValue();
25325
25326 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25327 SDValue CmpOpOther = SubsNode.getOperand(0);
25328 EVT VT = N->getValueType(0);
25329
25330 unsigned ExpectedOpcode;
25331 SDValue ExpectedOp;
25332 SDValue SubsOp;
25333 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25334 if (CmpOpConst) {
25335 ExpectedOpcode = ISD::ADD;
25336 ExpectedOp =
25337 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25338 CmpOpConst->getValueType(0));
25339 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25340 CmpOpConst->getValueType(0));
25341 } else {
25342 ExpectedOpcode = ISD::SUB;
25343 ExpectedOp = CmpOpToMatch;
25344 SubsOp = CmpOpToMatch;
25345 }
25346
25347 // Get the operand that can be reassociated with the SUBS instruction.
25348 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25349 if (Op.getOpcode() != ExpectedOpcode)
25350 return SDValue();
25351 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25352 !Op.getOperand(0).hasOneUse())
25353 return SDValue();
25354 SDValue X = Op.getOperand(0).getOperand(0);
25355 SDValue Y = Op.getOperand(0).getOperand(1);
25356 if (X != CmpOpOther)
25357 std::swap(X, Y);
25358 if (X != CmpOpOther)
25359 return SDValue();
25360 if (ExpectedOp != Op.getOperand(1))
25361 return SDValue();
25362 return Y;
25363 };
25364
25365 // Try the reassociation using the given constant and condition code.
25366 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25367 SDValue SubsOp) {
25368 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25369 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25370 if (!TReassocOp && !FReassocOp)
25371 return SDValue();
25372
25373 SDValue NewCmp =
25374 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25375 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25376
25377 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25378 if (!ReassocOp)
25379 return N->getOperand(OpNum);
25380 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25381 NewCmp.getValue(0), ReassocOp);
25382 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25383 return Res;
25384 };
25385
25386 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25387 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25388 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25389 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25390 };
25391
25392 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25393
25394 // First, try to eliminate the compare instruction by searching for a
25395 // subtraction with the same constant.
25396 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25397 return R;
25398
25399 if (!CmpOpConst) {
25400 // Try again with the operands of the SUBS instruction and the condition
25401 // swapped. Due to canonicalization, this only helps for non-constant
25402 // operands of the SUBS instruction.
25403 std::swap(CmpOpToMatch, CmpOpOther);
25404 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25405 return R;
25406 return SDValue();
25407 }
25408
25409 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25410 return SDValue();
25411
25412 // Next, search for a subtraction with a slightly different constant. By
25413 // adjusting the condition code, we can still eliminate the compare
25414 // instruction. Adjusting the constant is only valid if it does not result
25415 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25416 // Since such comparisons are trivially true/false, we should not encounter
25417 // them here but check for them nevertheless to be on the safe side.
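// For example, "x <u 10" (LO) can reuse a subtraction by 9 by testing
// "x <=u 9" (LS) instead; the adjustment is rejected when it would wrap
// (e.g. LO with a constant of 0).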
25418 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25419 AArch64CC::CondCode NewCC) {
25420 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25421 CmpOpConst->getValueType(0));
25422 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25423 CmpOpConst->getValueType(0));
25424 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25425 };
25426 switch (CC) {
25427 case AArch64CC::EQ:
25428 case AArch64CC::LS:
25429 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25430 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25431 case AArch64CC::NE:
25432 case AArch64CC::HI:
25433 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25434 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25435 case AArch64CC::LO:
25436 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25437 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25438 case AArch64CC::HS:
25439 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25440 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25441 case AArch64CC::LT:
25442 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25443 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25444 case AArch64CC::LE:
25445 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25446 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25447 case AArch64CC::GT:
25448 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25449 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25450 case AArch64CC::GE:
25451 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25452 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25453 default:
25454 return SDValue();
25455 }
25456}
25457
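// Fold CSEL (LASTB P, Z), X, NE (PTEST_ANY P', P) -> CLASTB P, X, Z, where P'
// is P itself or an all-active predicate: take the last active element of Z,
// or fall back to X when no lane of P is active, which is exactly what CLASTB
// computes.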
25458static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25459 AArch64CC::CondCode OpCC =
25460 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25461
25462 if (OpCC != AArch64CC::NE)
25463 return SDValue();
25464
25465 SDValue PTest = Op->getOperand(3);
25466 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25467 return SDValue();
25468
25469 SDValue TruePred = PTest.getOperand(0);
25470 SDValue AnyPred = PTest.getOperand(1);
25471
25472 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25473 TruePred = TruePred.getOperand(0);
25474
25475 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25476 AnyPred = AnyPred.getOperand(0);
25477
25478 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25479 return SDValue();
25480
25481 SDValue LastB = Op->getOperand(0);
25482 SDValue Default = Op->getOperand(1);
25483
25484 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25485 return SDValue();
25486
25487 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25488 AnyPred, Default, LastB.getOperand(1));
25489}
25490
25491// Optimize CSEL instructions
25492static SDValue performCSELCombine(SDNode *N,
25493 TargetLowering::DAGCombinerInfo &DCI,
25494 SelectionDAG &DAG) {
25495 // CSEL x, x, cc -> x
25496 if (N->getOperand(0) == N->getOperand(1))
25497 return N->getOperand(0);
25498
25499 if (SDValue R = foldCSELOfCSEL(N, DAG))
25500 return R;
25501
25502 // Try to reassociate the true/false expressions so that we can do CSE with
25503 // a SUBS instruction used to perform the comparison.
25504 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25505 return R;
25506
25507 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25508 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
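// The AND is sufficient because cttz returns the bit width (e.g. 32 for i32)
// when X is zero, and masking with bitwidth-1 maps that value to 0 while
// leaving all smaller results unchanged.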
25509 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25510 return Folded;
25511
25512 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25513 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
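// For example, a GT test on SUBS(x, y) can reuse an existing SUB(y, x) by
// emitting SUBS(y, x) and testing LT instead.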
25514 SDValue Cond = N->getOperand(3);
25515 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25516 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25517 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25518 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25519 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25520 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25521 !isNullConstant(Cond.getOperand(1))) {
25522 AArch64CC::CondCode OldCond =
25523 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25524 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25525 if (NewCond != AArch64CC::AL) {
25526 SDLoc DL(N);
25527 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25528 Cond.getOperand(1), Cond.getOperand(0));
25529 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25530 N->getOperand(1), getCondCode(DAG, NewCond),
25531 Sub.getValue(1));
25532 }
25533 }
25534
25535 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
25536 // use overflow flags, to avoid the comparison with zero. In case of success,
25537 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
25538 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
25539 // nodes with their SUBS equivalent as is already done for other flag-setting
25540 // operators, in which case doing the replacement here becomes redundant.
25541 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
25542 isNullConstant(Cond.getOperand(1))) {
25543 SDValue Sub = Cond.getOperand(0);
25544 AArch64CC::CondCode CC =
25545 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25546 if (Sub.getOpcode() == ISD::SUB &&
25547 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
25548 CC == AArch64CC::PL)) {
25549 SDLoc DL(N);
25550 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25551 Sub.getOperand(0), Sub.getOperand(1));
25552 DCI.CombineTo(Sub.getNode(), Subs);
25553 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
25554 return SDValue(N, 0);
25555 }
25556 }
25557
25558 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25559 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
25560 return CondLast;
25561
25562 return performCONDCombine(N, DCI, DAG, 2, 3);
25563}
25564
25565// Try to re-use an already extended operand of a vector SetCC feeding an
25566// extended select. Doing so avoids requiring another full extension of the
25567// SET_CC result when lowering the select.
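// For example, if (sign_extend v4i16 A to v4i32) already exists and every use
// of the v4i16 setcc is a v4i32 vselect, the setcc is rewritten to compare the
// extended operands, so its result no longer needs a separate widening.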
25568static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25569 EVT Op0MVT = Op->getOperand(0).getValueType();
25570 if (!Op0MVT.isVector() || Op->use_empty())
25571 return SDValue();
25572
25573 // Make sure that all uses of Op are VSELECTs with matching result types,
25574 // where the result type has a larger element type than the SetCC operand.
25575 SDNode *FirstUse = *Op->user_begin();
25576 if (FirstUse->getOpcode() != ISD::VSELECT)
25577 return SDValue();
25578 EVT UseMVT = FirstUse->getValueType(0);
25579 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25580 return SDValue();
25581 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
25582 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
25583 }))
25584 return SDValue();
25585
25586 APInt V;
25587 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
25588 return SDValue();
25589
25590 SDLoc DL(Op);
25591 SDValue Op0ExtV;
25592 SDValue Op1ExtV;
25593 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
25594 // Check if the first operand of the SET_CC is already extended. If it is,
25595 // split the SET_CC and re-use the extended version of the operand.
25596 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
25597 Op->getOperand(0));
25598 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
25599 Op->getOperand(0));
25600 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25601 Op0ExtV = SDValue(Op0SExt, 0);
25602 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
25603 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25604 Op0ExtV = SDValue(Op0ZExt, 0);
25605 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
25606 } else
25607 return SDValue();
25608
25609 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
25610 Op0ExtV, Op1ExtV, Op->getOperand(2));
25611}
25612
25613static SDValue
25614performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25615 SelectionDAG &DAG) {
25616 SDValue Vec = N->getOperand(0);
25617 if (DCI.isBeforeLegalize() &&
25618 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25621 SDLoc DL(N);
25622 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
25623 DAG);
25624 }
25625
25626 return SDValue();
25627}
25628
25629static SDValue performSETCCCombine(SDNode *N,
25630 TargetLowering::DAGCombinerInfo &DCI,
25631 SelectionDAG &DAG) {
25632 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25633 SDValue LHS = N->getOperand(0);
25634 SDValue RHS = N->getOperand(1);
25635 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
25636 SDLoc DL(N);
25637 EVT VT = N->getValueType(0);
25638
25639 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
25640 return V;
25641
25642 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25643 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
25644 LHS->getOpcode() == AArch64ISD::CSEL &&
25645 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
25646 LHS->hasOneUse()) {
25647 // Invert CSEL's condition.
25648 auto OldCond =
25649 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
25650 auto NewCond = getInvertedCondCode(OldCond);
25651
25652 // csel 0, 1, !cond, X
25653 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
25654 LHS.getOperand(0), LHS.getOperand(1),
25655 getCondCode(DAG, NewCond), LHS.getOperand(3));
25656 return DAG.getZExtOrTrunc(CSEL, DL, VT);
25657 }
25658
25659 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
25660 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
25661 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
25662 LHS->hasOneUse()) {
25663 EVT TstVT = LHS->getValueType(0);
25664 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
25665 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
25666 // This pattern gets optimized better in emitComparison.
25667 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25668 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25669 DAG.getSignedConstant(TstImm, DL, TstVT));
25670 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25671 }
25672 }
25673
25674 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25675 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25676 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25677 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
25678 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25679 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25680 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
25681 LHS->getOpcode() == ISD::BITCAST) {
25682 EVT ToVT = LHS->getValueType(0);
25683 EVT FromVT = LHS->getOperand(0).getValueType();
25684 if (FromVT.isFixedLengthVector() &&
25685 FromVT.getVectorElementType() == MVT::i1) {
25686 bool IsNull = isNullConstant(RHS);
25687 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25688 DL, MVT::i1, LHS->getOperand(0));
25689 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25690 LHS);
25691 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25692 }
25693 }
25694
25695 // Try to perform the memcmp when the result is tested for [in]equality with 0
25696 if (SDValue V = performOrXorChainCombine(N, DAG))
25697 return V;
25698
25699 EVT CmpVT = LHS.getValueType();
25700
25701 // NOTE: This exists as a combine only because it proved too awkward to match
25702 // splat(1) across all the NEON types during isel.
25703 APInt SplatLHSVal;
25704 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
25705 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
25706 SplatLHSVal.isOne())
25707 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
25708
25709 return SDValue();
25710}
25711
25712// Replace a flag-setting operator (eg ANDS) with the generic version
25713// (eg AND) if the flag is unused.
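// For example, (ANDS x, y) whose flag result is unused becomes (AND x, y), and
// an existing identical (AND x, y) node is folded into the value result of the
// flag-setting node.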
25714static SDValue performFlagSettingCombine(SDNode *N,
25715 TargetLowering::DAGCombinerInfo &DCI,
25716 unsigned GenericOpcode) {
25717 SDLoc DL(N);
25718 SDValue LHS = N->getOperand(0);
25719 SDValue RHS = N->getOperand(1);
25720 EVT VT = N->getValueType(0);
25721
25722 // If the flag result isn't used, convert back to a generic opcode.
25723 if (!N->hasAnyUseOfValue(1)) {
25724 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
25725 return DCI.CombineTo(N, Res, SDValue(N, 1));
25726 }
25727
25728 // Combine identical generic nodes into this node, re-using the result.
25729 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25730 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
25731 DCI.CombineTo(Generic, SDValue(N, 0));
25732
25733 return SDValue();
25734}
25735
25736static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25737 // setcc_merge_zero pred
25738 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25739 // => extract_subvector (inner setcc_merge_zero)
25740 SDValue Pred = N->getOperand(0);
25741 SDValue LHS = N->getOperand(1);
25742 SDValue RHS = N->getOperand(2);
25743 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25744
25745 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
25746 LHS->getOpcode() != ISD::SIGN_EXTEND)
25747 return SDValue();
25748
25749 SDValue Extract = LHS->getOperand(0);
25750 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25751 Extract->getValueType(0) != N->getValueType(0) ||
25752 Extract->getConstantOperandVal(1) != 0)
25753 return SDValue();
25754
25755 SDValue InnerSetCC = Extract->getOperand(0);
25756 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25757 return SDValue();
25758
25759 // By this point we've effectively got
25760 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25761 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25762 // can operate on A directly.
25763 SDValue InnerPred = InnerSetCC.getOperand(0);
25764 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25765 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25766 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
25767 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
25768 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
25769 return Extract;
25770
25771 return SDValue();
25772}
25773
25774static SDValue
25775performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25776 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25777 "Unexpected opcode!");
25778
25779 SelectionDAG &DAG = DCI.DAG;
25780 SDValue Pred = N->getOperand(0);
25781 SDValue LHS = N->getOperand(1);
25782 SDValue RHS = N->getOperand(2);
25783 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25784
25785 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25786 return V;
25787
25788 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
25789 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25790 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
25791 // setcc_merge_zero(
25792 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25793 // => setcc_merge_zero(pred, ...)
25794 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25795 LHS->getOperand(0)->getOperand(0) == Pred)
25796 return LHS->getOperand(0);
25797
25798 // setcc_merge_zero(
25799 // all_active, extend(nxvNi1 ...), != splat(0))
25800 // -> nxvNi1 ...
25801 if (isAllActivePredicate(DAG, Pred))
25802 return LHS->getOperand(0);
25803
25804 // setcc_merge_zero(
25805 // pred, extend(nxvNi1 ...), != splat(0))
25806 // -> nxvNi1 and(pred, ...)
25807 if (DCI.isAfterLegalizeDAG())
25808 // Do this after legalization to allow more folds on setcc_merge_zero
25809 // to be recognized.
25810 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
25811 LHS->getOperand(0), Pred);
25812 }
25813
25814 return SDValue();
25815}
25816
25817// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25818// as well as whether the test should be inverted. This code is required to
25819// catch these cases (as opposed to standard dag combines) because
25820// AArch64ISD::TBZ is matched during legalization.
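// For example, (tbz (and (srl x, 2), 1), 0) becomes (tbz x, 2), and
// (tbz (xor x, -1), 3) becomes (tbnz x, 3).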
25821static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25822 SelectionDAG &DAG) {
25823
25824 if (!Op->hasOneUse())
25825 return Op;
25826
25827 // We don't handle undef/constant-fold cases below, as they should have
25828 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25829 // etc.)
25830
25831 // (tbz (trunc x), b) -> (tbz x, b)
25832 // This case is just here to enable more of the below cases to be caught.
25833 if (Op->getOpcode() == ISD::TRUNCATE &&
25834 Bit < Op->getValueType(0).getSizeInBits()) {
25835 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25836 }
25837
25838 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25839 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25840 Bit < Op->getOperand(0).getValueSizeInBits()) {
25841 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25842 }
25843
25844 if (Op->getNumOperands() != 2)
25845 return Op;
25846
25847 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
25848 if (!C)
25849 return Op;
25850
25851 switch (Op->getOpcode()) {
25852 default:
25853 return Op;
25854
25855 // (tbz (and x, m), b) -> (tbz x, b)
25856 case ISD::AND:
25857 if ((C->getZExtValue() >> Bit) & 1)
25858 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25859 return Op;
25860
25861 // (tbz (shl x, c), b) -> (tbz x, b-c)
25862 case ISD::SHL:
25863 if (C->getZExtValue() <= Bit &&
25864 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25865 Bit = Bit - C->getZExtValue();
25866 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25867 }
25868 return Op;
25869
25870 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25871 case ISD::SRA:
25872 Bit = Bit + C->getZExtValue();
25873 if (Bit >= Op->getValueType(0).getSizeInBits())
25874 Bit = Op->getValueType(0).getSizeInBits() - 1;
25875 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25876
25877 // (tbz (srl x, c), b) -> (tbz x, b+c)
25878 case ISD::SRL:
25879 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25880 Bit = Bit + C->getZExtValue();
25881 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25882 }
25883 return Op;
25884
25885 // (tbz (xor x, -1), b) -> (tbnz x, b)
25886 case ISD::XOR:
25887 if ((C->getZExtValue() >> Bit) & 1)
25888 Invert = !Invert;
25889 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25890 }
25891}
25892
25893// Optimize test single bit zero/non-zero and branch.
25894static SDValue performTBZCombine(SDNode *N,
25895 TargetLowering::DAGCombinerInfo &DCI,
25896 SelectionDAG &DAG) {
25897 unsigned Bit = N->getConstantOperandVal(2);
25898 bool Invert = false;
25899 SDValue TestSrc = N->getOperand(1);
25900 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
25901
25902 if (TestSrc == NewTestSrc)
25903 return SDValue();
25904
25905 unsigned NewOpc = N->getOpcode();
25906 if (Invert) {
25907 if (NewOpc == AArch64ISD::TBZ)
25908 NewOpc = AArch64ISD::TBNZ;
25909 else {
25910 assert(NewOpc == AArch64ISD::TBNZ);
25911 NewOpc = AArch64ISD::TBZ;
25912 }
25913 }
25914
25915 SDLoc DL(N);
25916 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
25917 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
25918}
25919
25920// Swap vselect operands where it may allow a predicated operation to achieve
25921// the `sel`.
25922//
25923// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
25924// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
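// For example, (vselect (setcc oge p q) a (fadd a c)) becomes
// (vselect (setcc ult p q) (fadd a c) a), so the FADD's inactive lanes already
// hold a and it can be executed as a predicated operation.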
25925static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
25926 auto SelectA = N->getOperand(1);
25927 auto SelectB = N->getOperand(2);
25928 auto NTy = N->getValueType(0);
25929
25930 if (!NTy.isScalableVector())
25931 return SDValue();
25932 SDValue SetCC = N->getOperand(0);
25933 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25934 return SDValue();
25935
25936 switch (SelectB.getOpcode()) {
25937 default:
25938 return SDValue();
25939 case ISD::FMUL:
25940 case ISD::FSUB:
25941 case ISD::FADD:
25942 break;
25943 }
25944 if (SelectA != SelectB.getOperand(0))
25945 return SDValue();
25946
25947 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
25948 ISD::CondCode InverseCC =
25949 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
25950 auto InverseSetCC =
25951 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
25952 SetCC.getOperand(1), InverseCC);
25953
25954 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
25955 {InverseSetCC, SelectB, SelectA});
25956}
25957
25958// vselect (v1i1 setcc) ->
25959// vselect (v1iXX setcc) (XX is the size of the compared operand type)
25960// FIXME: Currently the type legalizer can't handle VSELECT with a v1i1
25961// condition. If it could legalize "VSELECT v1i1" correctly, there would be
25962// no need to combine such VSELECTs.
25963static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
25964 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25965 return SwapResult;
25966
25967 SDValue N0 = N->getOperand(0);
25968 SDValue IfTrue = N->getOperand(1);
25969 SDValue IfFalse = N->getOperand(2);
25970 EVT ResVT = N->getValueType(0);
25971 EVT CCVT = N0.getValueType();
25972
25973 if (isAllActivePredicate(DAG, N0))
25974 return N->getOperand(1);
25975
25976 if (isAllInactivePredicate(N0))
25977 return N->getOperand(2);
25978
25979 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
25980 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
25981 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
25982 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
25983 // -> merge_passthru_op A, B,{Bn,} C
25984 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
25985 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
25986 IfTrue->getOperand(0) == N0) {
25987 SmallVector<SDValue, 4> Ops(IfTrue->op_values());
25988 Ops[0] = N0;
25989 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
25990
25991 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
25992 }
25993 }
25994
25995 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
25996 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
25997 // supported types.
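// For example, for v4i32 this produces (OR (SRA lhs, 31), splat(1)), which is
// 1 for non-negative lanes and -1 for negative ones.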
25998 SDValue SetCC = N->getOperand(0);
25999 if (SetCC.getOpcode() == ISD::SETCC &&
26000 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26001 SDValue CmpLHS = SetCC.getOperand(0);
26002 EVT VT = CmpLHS.getValueType();
26003 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26004 SDNode *SplatLHS = N->getOperand(1).getNode();
26005 SDNode *SplatRHS = N->getOperand(2).getNode();
26006 APInt SplatLHSVal;
26007 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26008 VT.isSimple() &&
26009 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26010 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26011 VT.getSimpleVT().SimpleTy) &&
26012 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26013 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26014 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26015 unsigned NumElts = VT.getVectorNumElements();
26016 SmallVector<SDValue> Ops(
26017 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26018 VT.getScalarType()));
26019 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26020
26021 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26022 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26023 return Or;
26024 }
26025 }
26026
26027 EVT CmpVT = N0.getOperand(0).getValueType();
26028 if (N0.getOpcode() != ISD::SETCC ||
26029 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
26030 CCVT.getVectorElementType() != MVT::i1 ||
26031 CmpVT.getVectorElementType().isFloatingPoint())
26032 return SDValue();
26033
26034 // Only combine when the result type is of the same size as the compared
26035 // operands.
26036 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26037 return SDValue();
26038
26039 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26040 N0.getOperand(0), N0.getOperand(1),
26041 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26042 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26043 IfTrue, IfFalse);
26044}
26045
26046/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26047/// the compare-mask instructions rather than going via NZCV, even if LHS and
26048/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26049/// with a vector one followed by a DUP shuffle on the result.
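/// For example, a v4f32 select fed by (setcc f32 a, b, olt) becomes a v4f32
/// compare of (scalar_to_vector a) and (scalar_to_vector b) whose lane-0
/// result is broadcast with a DUP shuffle and used as the select mask.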
26050static SDValue performSelectCombine(SDNode *N,
26051 TargetLowering::DAGCombinerInfo &DCI) {
26052 SelectionDAG &DAG = DCI.DAG;
26053 SDValue N0 = N->getOperand(0);
26054 EVT ResVT = N->getValueType(0);
26055
26056 if (N0.getOpcode() != ISD::SETCC)
26057 return SDValue();
26058
26059 if (ResVT.isScalableVT())
26060 return SDValue();
26061
26062 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26063 // scalar SetCCResultType. We also don't expect vectors, because we assume
26064 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26065 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26066 "Scalar-SETCC feeding SELECT has unexpected result type!");
26067
26068 // If NumMaskElts == 0, the comparison is larger than the select result. The
26069 // largest real NEON comparison is 64 bits per lane, which means the result is
26070 // at most 32 bits and an illegal vector. Just bail out for now.
26071 EVT SrcVT = N0.getOperand(0).getValueType();
26072
26073 // Don't try to do this optimization when the setcc itself has i1 operands.
26074 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26075 // ruled out to prevent the creation of setcc that need to be scalarized.
26076 if (SrcVT == MVT::i1 ||
26077 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26078 return SDValue();
26079
26080 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26081 if (!ResVT.isVector() || NumMaskElts == 0)
26082 return SDValue();
26083
26084 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26085 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26086
26087 // Also bail out if the vector CCVT isn't the same size as ResVT.
26088 // This can happen if the SETCC operand size doesn't divide the ResVT size
26089 // (e.g., f64 vs v3f32).
26090 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26091 return SDValue();
26092
26093 // Make sure we didn't create illegal types, if we're not supposed to.
26094 assert(DCI.isBeforeLegalize() ||
26095 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26096
26097 // First perform a vector comparison, where lane 0 is the one we're interested
26098 // in.
26099 SDLoc DL(N0);
26100 SDValue LHS =
26101 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26102 SDValue RHS =
26103 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26104 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26105
26106 // Now duplicate the comparison mask we want across all other lanes.
26107 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26108 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26109 Mask = DAG.getNode(ISD::BITCAST, DL,
26110 ResVT.changeVectorElementTypeToInteger(), Mask);
26111
26112 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26113}
26114
26115static SDValue performDUPCombine(SDNode *N,
26116 TargetLowering::DAGCombinerInfo &DCI) {
26117 EVT VT = N->getValueType(0);
26118 SDLoc DL(N);
26119 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26120 // 128bit vector version.
26121 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26122 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26123 SmallVector<SDValue> Ops(N->ops());
26124 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26125 DCI.DAG.getVTList(LVT), Ops)) {
26126 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26127 DCI.DAG.getConstant(0, DL, MVT::i64));
26128 }
26129 }
26130
26131 if (N->getOpcode() == AArch64ISD::DUP) {
26132 // If the instruction is known to produce a scalar in SIMD registers, we can
26133 // duplicate it across the vector lanes using DUPLANE instead of moving it
26134 // to a GPR first. For example, this allows us to handle:
26135 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26136 SDValue Op = N->getOperand(0);
26137 // FIXME: Ideally, we should be able to handle all instructions that
26138 // produce a scalar value in FPRs.
26139 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26140 Op.getOpcode() == AArch64ISD::FCMGE ||
26141 Op.getOpcode() == AArch64ISD::FCMGT) {
26142 EVT ElemVT = VT.getVectorElementType();
26143 EVT ExpandedVT = VT;
26144 // Insert into a 128-bit vector to match DUPLANE's pattern.
26145 if (VT.getSizeInBits() != 128)
26146 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26147 128 / ElemVT.getSizeInBits());
26148 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26149 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26150 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26151 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26152 }
26153
26154 if (DCI.isAfterLegalizeDAG()) {
26155 // If scalar dup's operand is extract_vector_elt, try to combine them into
26156 // duplane. For example,
26157 //
26158 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26159 // t18: v4i32 = AArch64ISD::DUP t21
26160 // ==>
26161 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26162 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26163 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26164 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26165 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26166 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26167 EXTRACT_VEC_ELT.getOperand(1));
26168 }
26169 }
26170 }
26171
26172 return performPostLD1Combine(N, DCI, false);
26173 }
26174
26175 return SDValue();
26176}
26177
26178/// Get rid of unnecessary NVCASTs (that don't change the type).
26179static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26180 if (N->getValueType(0) == N->getOperand(0).getValueType())
26181 return N->getOperand(0);
26182 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26183 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26184 N->getOperand(0).getOperand(0));
26185
26186 return SDValue();
26187}
26188
26189// If all users of the globaladdr are of the form (globaladdr + constant), find
26190// the smallest constant, fold it into the globaladdr's offset and rewrite the
26191// globaladdr as (globaladdr + constant) - constant.
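// For example, with users (add G, 12) and (add G, 20), G is rewritten as
// ((G + 12) - 12), after which the adds fold to (G + 12) and ((G + 12) + 8).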
26192static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
26193 const AArch64Subtarget *Subtarget,
26194 const TargetMachine &TM) {
26195 auto *GN = cast<GlobalAddressSDNode>(N);
26196 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26197 AArch64II::MO_NO_FLAG)
26198 return SDValue();
26199
26200 uint64_t MinOffset = -1ull;
26201 for (SDNode *N : GN->users()) {
26202 if (N->getOpcode() != ISD::ADD)
26203 return SDValue();
26204 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26205 if (!C)
26206 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26207 if (!C)
26208 return SDValue();
26209 MinOffset = std::min(MinOffset, C->getZExtValue());
26210 }
26211 uint64_t Offset = MinOffset + GN->getOffset();
26212
26213 // Require that the new offset is larger than the existing one. Otherwise, we
26214 // can end up oscillating between two possible DAGs, for example,
26215 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26216 if (Offset <= uint64_t(GN->getOffset()))
26217 return SDValue();
26218
26219 // Check whether folding this offset is legal. It must not go out of bounds of
26220 // the referenced object to avoid violating the code model, and must be
26221 // smaller than 2^20 because this is the largest offset expressible in all
26222 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26223 // stores an immediate signed 21 bit offset.)
26224 //
26225 // This check also prevents us from folding negative offsets, which will end
26226 // up being treated in the same way as large positive ones. They could also
26227 // cause code model violations, and aren't really common enough to matter.
26228 if (Offset >= (1 << 20))
26229 return SDValue();
26230
26231 const GlobalValue *GV = GN->getGlobal();
26232 Type *T = GV->getValueType();
26233 if (!T->isSized() ||
26234 Offset > GV->getDataLayout().getTypeAllocSize(T))
26235 return SDValue();
26236
26237 SDLoc DL(GN);
26238 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26239 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26240 DAG.getConstant(MinOffset, DL, MVT::i64));
26241}
26242
26243static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26244 const AArch64Subtarget *Subtarget) {
26245 SDValue BR = N->getOperand(0);
26246 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26247 !BR.getValueType().isScalarInteger())
26248 return SDValue();
26249
26250 SDLoc DL(N);
26251 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26252}
26253
26254// Turns the vector of indices into a vector of byte offsets by scaling Offset
26255// by (BitWidth / 8).
26256static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
26257 SDLoc DL, unsigned BitWidth) {
26258 assert(Offset.getValueType().isScalableVector() &&
26259 "This method is only for scalable vectors of offsets");
26260
26261 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26262 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26263
26264 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26265}
26266
26267/// Check if the value of \p OffsetInBytes can be used as an immediate for
26268/// the gather load/prefetch and scatter store instructions with vector base and
26269/// immediate offset addressing mode:
26270///
26271/// [<Zn>.[S|D]{, #<imm>}]
26272///
26273/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
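/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.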
26274inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26275 unsigned ScalarSizeInBytes) {
26276 // The immediate is not a multiple of the scalar size.
26277 if (OffsetInBytes % ScalarSizeInBytes)
26278 return false;
26279
26280 // The immediate is out of range.
26281 if (OffsetInBytes / ScalarSizeInBytes > 31)
26282 return false;
26283
26284 return true;
26285}
26286
26287/// Check if the value of \p Offset represents a valid immediate for the SVE
26288/// gather load/prefetch and scatter store instructions with vector base and
26289/// immediate offset addressing mode:
26290///
26291/// [<Zn>.[S|D]{, #<imm>}]
26292///
26293/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26294static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26295 unsigned ScalarSizeInBytes) {
26296 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26297 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26298 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26299}
26300
26301static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26302 unsigned Opcode,
26303 bool OnlyPackedOffsets = true) {
26304 const SDValue Src = N->getOperand(2);
26305 const EVT SrcVT = Src->getValueType(0);
26306 assert(SrcVT.isScalableVector() &&
26307 "Scatter stores are only possible for SVE vectors");
26308
26309 SDLoc DL(N);
26310 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26311
26312 // Make sure that source data will fit into an SVE register
26313 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26314 return SDValue();
26315
26316 // For FPs, ACLE only supports _packed_ single and double precision types.
26317 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26318 if (SrcElVT.isFloatingPoint())
26319 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26320 ((Opcode != AArch64ISD::SST1Q_PRED &&
26321 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26322 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26323 return SDValue();
26324
26325 // Depending on the addressing mode, this is either a pointer or a vector of
26326 // pointers (that fits into one register)
26327 SDValue Base = N->getOperand(4);
26328 // Depending on the addressing mode, this is either a single offset or a
26329 // vector of offsets (that fits into one register)
26330 SDValue Offset = N->getOperand(5);
26331
26332 // For "scalar + vector of indices", just scale the indices. This only
26333 // applies to non-temporal scatters because there's no instruction that takes
26334 // indices.
26335 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26336 Offset =
26337 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
26338 Opcode = AArch64ISD::SSTNT1_PRED;
26339 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26340 Offset =
26341 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
26342 Opcode = AArch64ISD::SST1Q_PRED;
26343 }
26344
26345 // In the case of non-temporal scatter stores there's only one SVE instruction
26346 // per data-size: "scalar + vector", i.e.
26347 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26348 // Since we do have intrinsics that allow the arguments to be in a different
26349 // order, we may need to swap them to match the spec.
26350 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26351 Offset.getValueType().isVector())
26352 std::swap(Base, Offset);
26353
26354 // SST1_IMM requires that the offset is an immediate that is:
26355 // * a multiple of #SizeInBytes,
26356 // * in the range [0, 31 x #SizeInBytes],
26357 // where #SizeInBytes is the size in bytes of the stored items. For
26358 // immediates outside that range and non-immediate scalar offsets use SST1 or
26359 // SST1_UXTW instead.
26360 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26361 if (!isValidImmForSVEVecImmAddrMode(Offset,
26362 SrcVT.getScalarSizeInBits() / 8)) {
26363 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26364 Opcode = AArch64ISD::SST1_UXTW_PRED;
26365 else
26366 Opcode = AArch64ISD::SST1_PRED;
26367
26368 std::swap(Base, Offset);
26369 }
26370 }
26371
26372 auto &TLI = DAG.getTargetLoweringInfo();
26373 if (!TLI.isTypeLegal(Base.getValueType()))
26374 return SDValue();
26375
26376 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26377 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
26378 // nxv2i64. Legalize accordingly.
26379 if (!OnlyPackedOffsets &&
26380 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26381 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26382
26383 if (!TLI.isTypeLegal(Offset.getValueType()))
26384 return SDValue();
26385
26386 // Source value type that is representable in hardware
26387 EVT HwSrcVt = getSVEContainerType(SrcVT);
26388
26389 // Keep the original type of the input data to store - this is needed to be
26390 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26391 // FP values we want the integer equivalent, so just use HwSrcVt.
26392 SDValue InputVT = DAG.getValueType(SrcVT);
26393 if (SrcVT.isFloatingPoint())
26394 InputVT = DAG.getValueType(HwSrcVt);
26395
26396 SDVTList VTs = DAG.getVTList(MVT::Other);
26397 SDValue SrcNew;
26398
26399 if (Src.getValueType().isFloatingPoint())
26400 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26401 else
26402 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26403
26404 SDValue Ops[] = {N->getOperand(0), // Chain
26405 SrcNew,
26406 N->getOperand(3), // Pg
26407 Base,
26408 Offset,
26409 InputVT};
26410
26411 return DAG.getNode(Opcode, DL, VTs, Ops);
26412}
26413
26414static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
26415 unsigned Opcode,
26416 bool OnlyPackedOffsets = true) {
26417 const EVT RetVT = N->getValueType(0);
26418 assert(RetVT.isScalableVector() &&
26419 "Gather loads are only possible for SVE vectors");
26420
26421 SDLoc DL(N);
26422
26423 // Make sure that the loaded data will fit into an SVE register
26424 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26425 return SDValue();
26426
26427 // Depending on the addressing mode, this is either a pointer or a vector of
26428 // pointers (that fits into one register)
26429 SDValue Base = N->getOperand(3);
26430 // Depending on the addressing mode, this is either a single offset or a
26431 // vector of offsets (that fits into one register)
26432 SDValue Offset = N->getOperand(4);
26433
26434 // For "scalar + vector of indices", scale the indices to obtain unscaled
26435 // offsets. This applies to non-temporal and quadword gathers, which do not
26436 // have an addressing mode with scaled offset.
26437 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26438 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26439 RetVT.getScalarSizeInBits());
26440 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26441 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26442 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26443 RetVT.getScalarSizeInBits());
26444 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26445 }
26446
26447 // In the case of non-temporal gather loads and quadword gather loads there's
26448 // only one addressing mode : "vector + scalar", e.g.
26449 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26450 // Since we do have intrinsics that allow the arguments to be in a different
26451 // order, we may need to swap them to match the spec.
26452 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26453 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26454 Offset.getValueType().isVector())
26455 std::swap(Base, Offset);
26456
26457 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26458 // * a multiple of #SizeInBytes,
26459 // * in the range [0, 31 x #SizeInBytes],
26460 // where #SizeInBytes is the size in bytes of the loaded items. For
26461 // immediates outside that range and non-immediate scalar offsets use
26462 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26463 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26464 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26465 if (!isValidImmForSVEVecImmAddrMode(Offset,
26466 RetVT.getScalarSizeInBits() / 8)) {
26467 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26468 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26469 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26470 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26471 else
26472 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26473 ? AArch64ISD::GLD1_MERGE_ZERO
26474 : AArch64ISD::GLDFF1_MERGE_ZERO;
26475
26476 std::swap(Base, Offset);
26477 }
26478 }
26479
26480 auto &TLI = DAG.getTargetLoweringInfo();
26481 if (!TLI.isTypeLegal(Base.getValueType()))
26482 return SDValue();
26483
26484 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26485 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
26486 // nxv2i64. Legalize accordingly.
26487 if (!OnlyPackedOffsets &&
26488 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26489 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26490
26491 // Return value type that is representable in hardware
26492 EVT HwRetVt = getSVEContainerType(RetVT);
26493
26494 // Keep the original output value type around - this is needed to be able to
26495 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26496 // values we want the integer equivalent, so just use HwRetVT.
26497 SDValue OutVT = DAG.getValueType(RetVT);
26498 if (RetVT.isFloatingPoint())
26499 OutVT = DAG.getValueType(HwRetVt);
26500
26501 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
26502 SDValue Ops[] = {N->getOperand(0), // Chain
26503 N->getOperand(2), // Pg
26504 Base, Offset, OutVT};
26505
26506 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
26507 SDValue LoadChain = SDValue(Load.getNode(), 1);
26508
26509 if (RetVT.isInteger() && (RetVT != HwRetVt))
26510 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
26511
26512 // If the original return value was FP, bitcast accordingly. Doing it here
26513 // means that we can avoid adding TableGen patterns for FPs.
26514 if (RetVT.isFloatingPoint())
26515 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
26516
26517 return DAG.getMergeValues({Load, LoadChain}, DL);
26518}
26519
26520static SDValue
26521performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26522 SelectionDAG &DAG) {
26523 SDLoc DL(N);
26524 SDValue Src = N->getOperand(0);
26525 unsigned Opc = Src->getOpcode();
26526
26527 // Sign extend of an unsigned unpack -> signed unpack
26528 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26529
26530 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26531 : AArch64ISD::SUNPKLO;
26532
26533 // Push the sign extend to the operand of the unpack
26534 // This is necessary where, for example, the operand of the unpack
26535 // is another unpack:
26536 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26537 // ->
26538 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26539 // ->
26540 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26541 SDValue ExtOp = Src->getOperand(0);
26542 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
26543 EVT EltTy = VT.getVectorElementType();
26544 (void)EltTy;
26545
26546 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26547 "Sign extending from an invalid type");
26548
26549 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
26550
26551 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
26552 ExtOp, DAG.getValueType(ExtVT));
26553
26554 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
26555 }
26556
26557 if (DCI.isBeforeLegalizeOps())
26558 return SDValue();
26559
26560 if (!EnableCombineMGatherIntrinsics)
26561 return SDValue();
26562
26563 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26564 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26565 unsigned NewOpc;
26566 unsigned MemVTOpNum = 4;
26567 switch (Opc) {
26568 case AArch64ISD::LD1_MERGE_ZERO:
26569 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26570 MemVTOpNum = 3;
26571 break;
26572 case AArch64ISD::LDNF1_MERGE_ZERO:
26573 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26574 MemVTOpNum = 3;
26575 break;
26576 case AArch64ISD::LDFF1_MERGE_ZERO:
26577 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26578 MemVTOpNum = 3;
26579 break;
26580 case AArch64ISD::GLD1_MERGE_ZERO:
26581 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26582 break;
26583 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26584 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
26585 break;
26586 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26587 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
26588 break;
26589 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26590 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
26591 break;
26592 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26593 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
26594 break;
26595 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26596 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
26597 break;
26598 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26599 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
26600 break;
26601 case AArch64ISD::GLDFF1_MERGE_ZERO:
26602 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
26603 break;
26604 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
26605 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
26606 break;
26607 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
26608 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
26609 break;
26610 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
26611 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
26612 break;
26613 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
26614 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
26615 break;
26616 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
26617 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
26618 break;
26619 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
26620 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
26621 break;
26622 case AArch64ISD::GLDNT1_MERGE_ZERO:
26623 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
26624 break;
26625 default:
26626 return SDValue();
26627 }
26628
26629 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
26630 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
26631
26632 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
26633 return SDValue();
26634
26635 EVT DstVT = N->getValueType(0);
26636 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
26637
26638 SmallVector<SDValue, 5> Ops;
26639 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
26640 Ops.push_back(Src->getOperand(I));
26641
26642 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
26643 DCI.CombineTo(N, ExtLoad);
26644 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
26645
26646 // Return N so it doesn't get rechecked
26647 return SDValue(N, 0);
26648}
26649
26650/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26651/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26652/// != nxv2i32) do not need legalization.
26653static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26654 const unsigned OffsetPos = 4;
26655 SDValue Offset = N->getOperand(OffsetPos);
26656
26657 // Not an unpacked vector, bail out.
26658 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26659 return SDValue();
26660
26661 // Extend the unpacked offset vector to 64-bit lanes.
26662 SDLoc DL(N);
26663 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
26664 SmallVector<SDValue, 5> Ops(N->ops());
26665 // Replace the offset operand with the 64-bit one.
26666 Ops[OffsetPos] = Offset;
26667
26668 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26669}
26670
26671/// Combines a node carrying the intrinsic
26672/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
26673/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
26674/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
26675/// SVE gather prefetch instruction with vector plus immediate addressing mode.
26676static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
26677 unsigned ScalarSizeInBytes) {
26678 const unsigned ImmPos = 4, OffsetPos = 3;
26679 // No need to combine the node if the immediate is valid...
26680 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
26681 return SDValue();
26682
26683 // ...otherwise swap the offset base with the offset...
26684 SmallVector<SDValue, 5> Ops(N->ops());
26685 std::swap(Ops[ImmPos], Ops[OffsetPos]);
26686 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
26687 // `aarch64_sve_prfb_gather_uxtw_index`.
26688 SDLoc DL(N);
26689 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
26690 MVT::i64);
26691
26692 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26693}
26694
26695// Return true if the vector operation can guarantee only the first lane of its
26696// result contains data, with all bits in other lanes set to zero.
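// For example, SVE horizontal reductions such as UADDV write their scalar
// result to lane 0 and leave the remaining lanes of the destination zeroed.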
26697static bool isLanes1toNKnownZero(SDValue Op) {
26698 switch (Op.getOpcode()) {
26699 default:
26700 return false;
26701 case AArch64ISD::ANDV_PRED:
26702 case AArch64ISD::EORV_PRED:
26703 case AArch64ISD::FADDA_PRED:
26704 case AArch64ISD::FADDV_PRED:
26705 case AArch64ISD::FMAXNMV_PRED:
26706 case AArch64ISD::FMAXV_PRED:
26707 case AArch64ISD::FMINNMV_PRED:
26708 case AArch64ISD::FMINV_PRED:
26709 case AArch64ISD::ORV_PRED:
26710 case AArch64ISD::SADDV_PRED:
26711 case AArch64ISD::SMAXV_PRED:
26712 case AArch64ISD::SMINV_PRED:
26713 case AArch64ISD::UADDV_PRED:
26714 case AArch64ISD::UMAXV_PRED:
26715 case AArch64ISD::UMINV_PRED:
26716 return true;
26717 }
26718}
26719
26720static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26721 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26722 SDValue InsertVec = N->getOperand(0);
26723 SDValue InsertElt = N->getOperand(1);
26724 SDValue InsertIdx = N->getOperand(2);
26725
26726 // We only care about inserts into the first element...
26727 if (!isNullConstant(InsertIdx))
26728 return SDValue();
26729 // ...of a zero'd vector...
26730 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
26731 return SDValue();
26732 // ...where the inserted data was previously extracted...
26733 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26734 return SDValue();
26735
26736 SDValue ExtractVec = InsertElt.getOperand(0);
26737 SDValue ExtractIdx = InsertElt.getOperand(1);
26738
26739 // ...from the first element of a vector.
26740 if (!isNullConstant(ExtractIdx))
26741 return SDValue();
26742
26743 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26744
26745 // Ensure there's no type conversion going on.
26746 if (N->getValueType(0) != ExtractVec.getValueType())
26747 return SDValue();
26748
26749 if (!isLanes1toNKnownZero(ExtractVec))
26750 return SDValue();
26751
26752 // The explicit zeroing is redundant.
26753 return ExtractVec;
26754}
26755
26756static SDValue
26757performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26758 if (SDValue Res = removeRedundantInsertVectorElt(N))
26759 return Res;
26760
26761 return performPostLD1Combine(N, DCI, true);
26762}
26763
26764static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26765 TargetLowering::DAGCombinerInfo &DCI,
26766 const AArch64Subtarget *Subtarget) {
26767 SDValue N0 = N->getOperand(0);
26768 EVT VT = N->getValueType(0);
26769
26770 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
26771 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26772 return SDValue();
26773
26774 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26775 EVT EltVT = VT.getVectorElementType();
26776 return EltVT == MVT::f32 || EltVT == MVT::f64;
26777 };
26778
26779 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26780 // We purposefully don't care about legality of the nodes here as we know
26781 // they can be split down into something legal.
26782 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
26783 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26784 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26785 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26786 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
26787 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
26788 LN0->getChain(), LN0->getBasePtr(),
26789 N0.getValueType(), LN0->getMemOperand());
26790 DCI.CombineTo(N, ExtLoad);
26791 DCI.CombineTo(
26792 N0.getNode(),
26793 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
26794 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
26795 ExtLoad.getValue(1));
26796 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26797 }
26798
26799 return SDValue();
26800}
26801
26802static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
26803 const AArch64Subtarget *Subtarget) {
26804 EVT VT = N->getValueType(0);
26805
26806 // Don't expand for NEON, SVE2 or SME
26807 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26808 return SDValue();
26809
26810 SDLoc DL(N);
26811
26812 SDValue Mask = N->getOperand(0);
26813 SDValue In1 = N->getOperand(1);
26814 SDValue In2 = N->getOperand(2);
26815
26816 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
26817 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
26818 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
26819 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
26820}
26821
26823 EVT VT = N->getValueType(0);
26824
26825 SDValue Insert = N->getOperand(0);
26826 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26827 return SDValue();
26828
26829 if (!Insert.getOperand(0).isUndef())
26830 return SDValue();
26831
26832 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
26833 uint64_t IdxDupLane = N->getConstantOperandVal(1);
26834 if (IdxInsert != 0 || IdxDupLane != 0)
26835 return SDValue();
26836
26837 SDValue Bitcast = Insert.getOperand(1);
26838 if (Bitcast.getOpcode() != ISD::BITCAST)
26839 return SDValue();
26840
26841 SDValue Subvec = Bitcast.getOperand(0);
26842 EVT SubvecVT = Subvec.getValueType();
26843 if (!SubvecVT.is128BitVector())
26844 return SDValue();
26845 EVT NewSubvecVT =
26846 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
26847
26848 SDLoc DL(N);
26849 SDValue NewInsert =
26850 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
26851 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
26852 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
26853 NewInsert, N->getOperand(1));
26854 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
26855}
26856
26857// Try to combine mull with uzp1.
26860 SelectionDAG &DAG) {
26861 if (DCI.isBeforeLegalizeOps())
26862 return SDValue();
26863
26864 SDValue LHS = N->getOperand(0);
26865 SDValue RHS = N->getOperand(1);
26866
26867 SDValue ExtractHigh;
26868 SDValue ExtractLow;
26869 SDValue TruncHigh;
26870 SDValue TruncLow;
26871 SDLoc DL(N);
26872
26873 // Check the operands are trunc and extract_high.
26875 RHS.getOpcode() == ISD::TRUNCATE) {
26876 TruncHigh = RHS;
26877 if (LHS.getOpcode() == ISD::BITCAST)
26878 ExtractHigh = LHS.getOperand(0);
26879 else
26880 ExtractHigh = LHS;
26882 LHS.getOpcode() == ISD::TRUNCATE) {
26883 TruncHigh = LHS;
26884 if (RHS.getOpcode() == ISD::BITCAST)
26885 ExtractHigh = RHS.getOperand(0);
26886 else
26887 ExtractHigh = RHS;
26888 } else
26889 return SDValue();
26890
26891 // If the truncate's operand is a DUP or another splat, do not combine the op
26892 // with uzp1.
26893 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
26894 SDValue TruncHighOp = TruncHigh.getOperand(0);
26895 EVT TruncHighOpVT = TruncHighOp.getValueType();
26896 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26897 DAG.isSplatValue(TruncHighOp, false))
26898 return SDValue();
26899
26900 // Check there is another extract_high with the same source vector.
26901 // For example,
26902 //
26903 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
26904 // t12: v4i16 = truncate t11
26905 // t31: v4i32 = AArch64ISD::SMULL t18, t12
26906 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
26907 // t16: v4i16 = truncate t15
26908 // t30: v4i32 = AArch64ISD::SMULL t23, t16
26909 //
26910 // This dagcombine assumes the two extract_high nodes use the same source
26911 // vector in order to detect the pair of mulls. If they have different source
26912 // vectors, this code will not work.
26913 // TODO: Should also try to look through a bitcast.
26914 bool HasFoundMULLow = true;
26915 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
26916 if (ExtractHighSrcVec->use_size() != 2)
26917 HasFoundMULLow = false;
26918
26919 // Find ExtractLow.
26920 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26921 if (User == ExtractHigh.getNode())
26922 continue;
26923
26924 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26926 HasFoundMULLow = false;
26927 break;
26928 }
26929
26930 ExtractLow.setNode(User);
26931 }
26932
26933 if (!ExtractLow || !ExtractLow->hasOneUse())
26934 HasFoundMULLow = false;
26935
26936 // Check ExtractLow's user.
26937 if (HasFoundMULLow) {
26938 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26939 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26940 HasFoundMULLow = false;
26941 } else {
26942 if (ExtractLowUser->getOperand(0) == ExtractLow) {
26943 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
26944 TruncLow = ExtractLowUser->getOperand(1);
26945 else
26946 HasFoundMULLow = false;
26947 } else {
26948 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
26949 TruncLow = ExtractLowUser->getOperand(0);
26950 else
26951 HasFoundMULLow = false;
26952 }
26953 }
26954 }
26955
26956 // If the truncate's operand is a DUP or another splat, do not combine the op
26957 // with uzp1.
26958 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
26959 EVT TruncHighVT = TruncHigh.getValueType();
26960 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
26961 SDValue TruncLowOp =
26962 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
26963 EVT TruncLowOpVT = TruncLowOp.getValueType();
26964 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26965 DAG.isSplatValue(TruncLowOp, false)))
26966 return SDValue();
26967
26968 // Create uzp1, extract_high and extract_low.
26969 if (TruncHighOpVT != UZP1VT)
26970 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
26971 if (TruncLowOpVT != UZP1VT)
26972 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
26973
26974 SDValue UZP1 =
26975 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
26976 SDValue HighIdxCst =
26977 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
26978 SDValue NewTruncHigh =
26979 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
26980 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
26981
26982 if (HasFoundMULLow) {
26983 EVT TruncLowVT = TruncLow.getValueType();
26984 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
26985 UZP1, ExtractLow.getOperand(1));
26986 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
26987 }
26988
26989 return SDValue(N, 0);
26990}
26991
26994 SelectionDAG &DAG) {
26995 if (SDValue Val =
26997 return Val;
26998
26999 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27000 return Val;
27001
27002 return SDValue();
27003}
27004
27005static SDValue
27007 SelectionDAG &DAG) {
27008 // Let's do the following transform:
27009 //
27010 // t34: v4i32 = AArch64ISD::UADDLV t2
27011 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27012 // t7: i64 = zero_extend t35
27013 // t20: v1i64 = scalar_to_vector t7
27014 // ==>
27015 // t34: v4i32 = AArch64ISD::UADDLV t2
27016 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27017 // t40: v1i64 = AArch64ISD::NVCAST t39
27018 if (DCI.isBeforeLegalizeOps())
27019 return SDValue();
27020
27021 EVT VT = N->getValueType(0);
27022 if (VT != MVT::v1i64)
27023 return SDValue();
27024
27025 SDValue ZEXT = N->getOperand(0);
27026 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27027 return SDValue();
27028
27029 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27030 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27031 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27032 return SDValue();
27033
27034 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27035 return SDValue();
27036
27037 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27038 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27039 UADDLV.getValueType() != MVT::v4i32 ||
27040 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27041 return SDValue();
27042
27043 // Generate the new sequence using AArch64ISD::NVCAST.
27044 SDLoc DL(N);
27045 SDValue EXTRACT_SUBVEC =
27046 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27047 DAG.getConstant(0, DL, MVT::i64));
27048 SDValue NVCAST =
27049 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27050
27051 return NVCAST;
27052}
27053
27054/// If the operand is a bitwise AND with a constant RHS, the shift also has a
27055/// constant RHS, and the shift is the AND's only user, we can pull the AND out of the shift, i.e.
27056///
27057/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27058///
27059/// We prefer this canonical form to match existing isel patterns.
27062 SelectionDAG &DAG) {
27063 if (DCI.isBeforeLegalizeOps())
27064 return SDValue();
27065
27066 SDValue Op0 = N->getOperand(0);
27067 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27068 return SDValue();
27069
27070 SDValue C1 = Op0->getOperand(1);
27071 SDValue C2 = N->getOperand(1);
27072 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27073 return SDValue();
27074
27075 // Might be folded into a shifted op; do not lower.
27076 if (N->hasOneUse()) {
27077 unsigned UseOpc = N->user_begin()->getOpcode();
27078 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27079 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27080 return SDValue();
27081 }
27082
27083 SDLoc DL(N);
27084 EVT VT = N->getValueType(0);
27085
27086 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27087 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27088 // causing an infinite loop. The result may also be worse.
27089 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27090 if (!isa<ConstantSDNode>(NewRHS))
27091 return SDValue();
27092
27093 SDValue X = Op0->getOperand(0);
27094 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27095 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27096}
27097
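// Lower the rndr/rndrrs intrinsics to an MRS read of the RNDR/RNDRRS system
// register; the flags produced by that read are converted into the i1
// "valid" result with a CSINC on the NE condition.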
27099 unsigned IntrinsicID = N->getConstantOperandVal(1);
27100 auto Register =
27101 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27102 : AArch64SysReg::RNDRRS);
27103 SDLoc DL(N);
27104 SDValue A = DAG.getNode(
27105 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27106 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27107 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27108 DAG.getConstant(0, DL, MVT::i32),
27109 DAG.getConstant(0, DL, MVT::i32),
27110 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27111 return DAG.getMergeValues(
27112 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27113}
27114
27116 DAGCombinerInfo &DCI) const {
27117 SelectionDAG &DAG = DCI.DAG;
27118 switch (N->getOpcode()) {
27119 default:
27120 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27121 break;
27122 case ISD::VECREDUCE_AND:
27123 case ISD::VECREDUCE_OR:
27124 case ISD::VECREDUCE_XOR:
27125 return performVecReduceBitwiseCombine(N, DCI, DAG);
27126 case ISD::ADD:
27127 case ISD::SUB:
27128 return performAddSubCombine(N, DCI);
27129 case ISD::BUILD_VECTOR:
27130 return performBuildVectorCombine(N, DCI, DAG);
27131 case ISD::SMIN:
27132 return performSMINCombine(N, DAG);
27133 case ISD::TRUNCATE:
27134 return performTruncateCombine(N, DAG, DCI);
27135 case AArch64ISD::ANDS:
27136 return performFlagSettingCombine(N, DCI, ISD::AND);
27137 case AArch64ISD::ADC:
27138 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27139 return R;
27140 return foldADCToCINC(N, DAG);
27141 case AArch64ISD::SBC:
27142 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27143 case AArch64ISD::ADCS:
27144 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27145 return R;
27146 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27147 case AArch64ISD::SBCS:
27148 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27149 return R;
27150 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27151 case AArch64ISD::BICi: {
27153 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27154 APInt DemandedElts =
27155 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27156
27158 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27159 return SDValue();
27160
27161 break;
27162 }
27163 case ISD::XOR:
27164 return performXorCombine(N, DAG, DCI, Subtarget);
27165 case ISD::MUL:
27166 return performMulCombine(N, DAG, DCI, Subtarget);
27167 case ISD::SINT_TO_FP:
27168 case ISD::UINT_TO_FP:
27169 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27170 case ISD::FP_TO_SINT:
27171 case ISD::FP_TO_UINT:
27174 return performFpToIntCombine(N, DAG, DCI, Subtarget);
27175 case ISD::OR:
27176 return performORCombine(N, DCI, Subtarget, *this);
27177 case ISD::AND:
27178 return performANDCombine(N, DCI);
27179 case ISD::FADD:
27180 return performFADDCombine(N, DCI);
27182 return performIntrinsicCombine(N, DCI, Subtarget);
27183 case ISD::ANY_EXTEND:
27184 case ISD::ZERO_EXTEND:
27185 case ISD::SIGN_EXTEND:
27186 return performExtendCombine(N, DCI, DAG);
27188 return performSignExtendInRegCombine(N, DCI, DAG);
27190 return performConcatVectorsCombine(N, DCI, DAG);
27192 return performExtractSubvectorCombine(N, DCI, DAG);
27194 return performInsertSubvectorCombine(N, DCI, DAG);
27195 case ISD::SELECT:
27196 return performSelectCombine(N, DCI);
27197 case ISD::VSELECT:
27198 return performVSelectCombine(N, DCI.DAG);
27199 case ISD::SETCC:
27200 return performSETCCCombine(N, DCI, DAG);
27201 case ISD::LOAD:
27202 return performLOADCombine(N, DCI, DAG, Subtarget);
27203 case ISD::STORE:
27204 return performSTORECombine(N, DCI, DAG, Subtarget);
27205 case ISD::MSTORE:
27206 return performMSTORECombine(N, DCI, DAG, Subtarget);
27207 case ISD::MGATHER:
27208 case ISD::MSCATTER:
27210 return performMaskedGatherScatterCombine(N, DCI, DAG);
27211 case ISD::FP_EXTEND:
27212 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27213 case AArch64ISD::BRCOND:
27214 return performBRCONDCombine(N, DCI, DAG);
27215 case AArch64ISD::TBNZ:
27216 case AArch64ISD::TBZ:
27217 return performTBZCombine(N, DCI, DAG);
27218 case AArch64ISD::CSEL:
27219 return performCSELCombine(N, DCI, DAG);
27220 case AArch64ISD::DUP:
27221 case AArch64ISD::DUPLANE8:
27222 case AArch64ISD::DUPLANE16:
27223 case AArch64ISD::DUPLANE32:
27224 case AArch64ISD::DUPLANE64:
27225 return performDUPCombine(N, DCI);
27226 case AArch64ISD::DUPLANE128:
27227 return performDupLane128Combine(N, DAG);
27228 case AArch64ISD::NVCAST:
27229 return performNVCASTCombine(N, DAG);
27230 case AArch64ISD::SPLICE:
27231 return performSpliceCombine(N, DAG);
27232 case AArch64ISD::UUNPKLO:
27233 case AArch64ISD::UUNPKHI:
27234 return performUnpackCombine(N, DAG, Subtarget);
27235 case AArch64ISD::UZP1:
27236 case AArch64ISD::UZP2:
27237 return performUzpCombine(N, DAG, Subtarget);
27238 case AArch64ISD::SETCC_MERGE_ZERO:
27239 return performSetccMergeZeroCombine(N, DCI);
27240 case AArch64ISD::REINTERPRET_CAST:
27242 case AArch64ISD::GLD1_MERGE_ZERO:
27243 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27244 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27245 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27246 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27247 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27248 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27249 case AArch64ISD::GLD1S_MERGE_ZERO:
27250 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27251 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27252 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27253 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27254 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27255 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27256 return performGLD1Combine(N, DAG);
27257 case AArch64ISD::VASHR:
27258 case AArch64ISD::VLSHR:
27259 return performVectorShiftCombine(N, *this, DCI);
27260 case AArch64ISD::SUNPKLO:
27261 return performSunpkloCombine(N, DAG);
27262 case AArch64ISD::BSP:
27263 return performBSPExpandForSVE(N, DAG, Subtarget);
27265 return performInsertVectorEltCombine(N, DCI);
27267 return performExtractVectorEltCombine(N, DCI, Subtarget);
27268 case ISD::VECREDUCE_ADD:
27269 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27271 return performActiveLaneMaskCombine(N, DCI, Subtarget);
27272 case AArch64ISD::UADDV:
27273 return performUADDVCombine(N, DAG);
27274 case AArch64ISD::SMULL:
27275 case AArch64ISD::UMULL:
27276 case AArch64ISD::PMULL:
27277 return performMULLCombine(N, DCI, DAG);
27280 switch (N->getConstantOperandVal(1)) {
27281 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27282 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27283 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27284 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27285 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27286 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27287 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27288 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27289 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27290 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27291 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27292 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27293 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27294 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27295 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27296 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27298 case Intrinsic::aarch64_neon_ld2:
27299 case Intrinsic::aarch64_neon_ld3:
27300 case Intrinsic::aarch64_neon_ld4:
27301 case Intrinsic::aarch64_neon_ld1x2:
27302 case Intrinsic::aarch64_neon_ld1x3:
27303 case Intrinsic::aarch64_neon_ld1x4:
27304 case Intrinsic::aarch64_neon_ld2lane:
27305 case Intrinsic::aarch64_neon_ld3lane:
27306 case Intrinsic::aarch64_neon_ld4lane:
27307 case Intrinsic::aarch64_neon_ld2r:
27308 case Intrinsic::aarch64_neon_ld3r:
27309 case Intrinsic::aarch64_neon_ld4r:
27310 case Intrinsic::aarch64_neon_st2:
27311 case Intrinsic::aarch64_neon_st3:
27312 case Intrinsic::aarch64_neon_st4:
27313 case Intrinsic::aarch64_neon_st1x2:
27314 case Intrinsic::aarch64_neon_st1x3:
27315 case Intrinsic::aarch64_neon_st1x4:
27316 case Intrinsic::aarch64_neon_st2lane:
27317 case Intrinsic::aarch64_neon_st3lane:
27318 case Intrinsic::aarch64_neon_st4lane:
27319 return performNEONPostLDSTCombine(N, DCI, DAG);
27320 case Intrinsic::aarch64_sve_ldnt1:
27321 return performLDNT1Combine(N, DAG);
27322 case Intrinsic::aarch64_sve_ld1rq:
27323 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
27324 case Intrinsic::aarch64_sve_ld1ro:
27325 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
27326 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27327 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27328 case Intrinsic::aarch64_sve_ldnt1_gather:
27329 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27330 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27331 return performGatherLoadCombine(N, DAG,
27332 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27333 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27334 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27335 case Intrinsic::aarch64_sve_ld1:
27336 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27337 case Intrinsic::aarch64_sve_ldnf1:
27338 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27339 case Intrinsic::aarch64_sve_ldff1:
27340 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27341 case Intrinsic::aarch64_sve_st1:
27342 return performST1Combine(N, DAG);
27343 case Intrinsic::aarch64_sve_stnt1:
27344 return performSTNT1Combine(N, DAG);
27345 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27346 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27347 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27348 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27349 case Intrinsic::aarch64_sve_stnt1_scatter:
27350 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27351 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27352 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27353 case Intrinsic::aarch64_sve_ld1_gather:
27354 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
27355 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
27356 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
27357 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
27358 case Intrinsic::aarch64_sve_ld1q_gather_index:
27359 return performGatherLoadCombine(N, DAG,
27360 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
27361 case Intrinsic::aarch64_sve_ld1_gather_index:
27362 return performGatherLoadCombine(N, DAG,
27363 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
27364 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
27365 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
27366 /*OnlyPackedOffsets=*/false);
27367 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
27368 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
27369 /*OnlyPackedOffsets=*/false);
27370 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
27371 return performGatherLoadCombine(N, DAG,
27372 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
27373 /*OnlyPackedOffsets=*/false);
27374 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
27375 return performGatherLoadCombine(N, DAG,
27376 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
27377 /*OnlyPackedOffsets=*/false);
27378 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
27379 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
27380 case Intrinsic::aarch64_sve_ldff1_gather:
27381 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
27382 case Intrinsic::aarch64_sve_ldff1_gather_index:
27383 return performGatherLoadCombine(N, DAG,
27384 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
27385 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
27386 return performGatherLoadCombine(N, DAG,
27387 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
27388 /*OnlyPackedOffsets=*/false);
27389 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
27390 return performGatherLoadCombine(N, DAG,
27391 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
27392 /*OnlyPackedOffsets=*/false);
27393 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
27394 return performGatherLoadCombine(N, DAG,
27395 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
27396 /*OnlyPackedOffsets=*/false);
27397 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
27398 return performGatherLoadCombine(N, DAG,
27399 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
27400 /*OnlyPackedOffsets=*/false);
27401 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
27402 return performGatherLoadCombine(N, DAG,
27403 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
27404 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
27405 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
27406 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
27407 case Intrinsic::aarch64_sve_st1q_scatter_index:
27408 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
27409 case Intrinsic::aarch64_sve_st1_scatter:
27410 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
27411 case Intrinsic::aarch64_sve_st1_scatter_index:
27412 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
27413 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
27414 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
27415 /*OnlyPackedOffsets=*/false);
27416 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
27417 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
27418 /*OnlyPackedOffsets=*/false);
27419 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
27420 return performScatterStoreCombine(N, DAG,
27421 AArch64ISD::SST1_SXTW_SCALED_PRED,
27422 /*OnlyPackedOffsets=*/false);
27423 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
27424 return performScatterStoreCombine(N, DAG,
27425 AArch64ISD::SST1_UXTW_SCALED_PRED,
27426 /*OnlyPackedOffsets=*/false);
27427 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
27428 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
27429 case Intrinsic::aarch64_rndr:
27430 case Intrinsic::aarch64_rndrrs:
27431 return performRNDRCombine(N, DAG);
27432 case Intrinsic::aarch64_sme_ldr_zt:
27433 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
27434 DAG.getVTList(MVT::Other), N->getOperand(0),
27435 N->getOperand(2), N->getOperand(3));
27436 case Intrinsic::aarch64_sme_str_zt:
27437 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
27438 DAG.getVTList(MVT::Other), N->getOperand(0),
27439 N->getOperand(2), N->getOperand(3));
27440 default:
27441 break;
27442 }
27443 break;
27444 case ISD::GlobalAddress:
27445 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
27446 case ISD::CTLZ:
27447 return performCTLZCombine(N, DAG, Subtarget);
27449 return performScalarToVectorCombine(N, DCI, DAG);
27450 case ISD::SHL:
27451 return performSHLCombine(N, DCI, DAG);
27452 }
27453 return SDValue();
27454}
27455
27456// Check if the return value is used only as a return value, as otherwise
27457// we can't perform a tail-call. In particular, we need to check for
27458// target ISD nodes that are returns and any other "odd" constructs
27459// that the generic analysis code won't necessarily catch.
27460bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
27461 SDValue &Chain) const {
27462 if (N->getNumValues() != 1)
27463 return false;
27464 if (!N->hasNUsesOfValue(1, 0))
27465 return false;
27466
27467 SDValue TCChain = Chain;
27468 SDNode *Copy = *N->user_begin();
27469 if (Copy->getOpcode() == ISD::CopyToReg) {
27470 // If the copy has a glue operand, we conservatively assume it isn't safe to
27471 // perform a tail call.
27472 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
27473 MVT::Glue)
27474 return false;
27475 TCChain = Copy->getOperand(0);
27476 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27477 return false;
27478
27479 bool HasRet = false;
27480 for (SDNode *Node : Copy->users()) {
27481 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27482 return false;
27483 HasRet = true;
27484 }
27485
27486 if (!HasRet)
27487 return false;
27488
27489 Chain = TCChain;
27490 return true;
27491}
27492
27493// Return whether an instruction can potentially be optimized to a tail
27494// call. This will cause the optimizers to attempt to move, or duplicate,
27495// return instructions to help enable tail call optimizations for this
27496// instruction.
27497bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27498 return CI->isTailCall();
27499}
27500
27501bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
27502 Register Offset, bool IsPre,
27503 MachineRegisterInfo &MRI) const {
27504 auto CstOffset = getIConstantVRegVal(Offset, MRI);
27505 if (!CstOffset || CstOffset->isZero())
27506 return false;
27507
27508 // All of the indexed addressing mode instructions take a signed 9 bit
27509 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
27510 // encodes the sign/indexing direction.
27511 return isInt<9>(CstOffset->getSExtValue());
27512}
27513
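// Match (add/sub Ptr, C) where C fits the signed 9-bit immediate used by the
// pre/post-indexed addressing modes, and return the base and offset to use.
// Bails out when the loaded value's only user is a scalable-vector splat,
// where a replicating load (ld1r*) is preferable.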
27514bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
27515 SDValue &Base,
27516 SDValue &Offset,
27517 SelectionDAG &DAG) const {
27518 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
27519 return false;
27520
27521 // Non-null if there is exactly one user of the loaded value (ignoring chain).
27522 SDNode *ValOnlyUser = nullptr;
27523 for (SDUse &U : N->uses()) {
27524 if (U.getResNo() == 1)
27525 continue; // Ignore chain.
27526 if (ValOnlyUser == nullptr)
27527 ValOnlyUser = U.getUser();
27528 else {
27529 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
27530 break;
27531 }
27532 }
27533
27534 auto IsUndefOrZero = [](SDValue V) {
27535 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
27536 };
27537
27538 // If the only user of the value is a scalable vector splat, it is
27539 // preferable to do a replicating load (ld1r*).
27540 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
27541 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
27542 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
27543 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
27544 return false;
27545
27546 Base = Op->getOperand(0);
27547 // All of the indexed addressing mode instructions take a signed
27548 // 9 bit immediate offset.
27549 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
27550 int64_t RHSC = RHS->getSExtValue();
27551 if (Op->getOpcode() == ISD::SUB)
27552 RHSC = -(uint64_t)RHSC;
27553 if (!isInt<9>(RHSC))
27554 return false;
27555 // When big-endian, VLD1/VST1 are used for vector loads and stores, and these
27556 // only allow an offset that's equal to the store size.
27557 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
27558 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
27559 (uint64_t)RHSC != MemType.getStoreSize())
27560 return false;
27561 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
27562 // when dealing with subtraction.
27563 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
27564 return true;
27565 }
27566 return false;
27567}
27568
27569bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
27570 SDValue &Offset,
27572 SelectionDAG &DAG) const {
27573 EVT VT;
27574 SDValue Ptr;
27575 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
27576 VT = LD->getMemoryVT();
27577 Ptr = LD->getBasePtr();
27578 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
27579 VT = ST->getMemoryVT();
27580 Ptr = ST->getBasePtr();
27581 } else
27582 return false;
27583
27584 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
27585 return false;
27586 AM = ISD::PRE_INC;
27587 return true;
27588}
27589
27590bool AArch64TargetLowering::getPostIndexedAddressParts(
27592 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
27593 EVT VT;
27594 SDValue Ptr;
27595 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
27596 VT = LD->getMemoryVT();
27597 Ptr = LD->getBasePtr();
27598 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
27599 VT = ST->getMemoryVT();
27600 Ptr = ST->getBasePtr();
27601 } else
27602 return false;
27603
27604 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
27605 return false;
27606 // Post-indexing updates the base, so it's not a valid transform
27607 // if that's not the same as the load's pointer.
27608 if (Ptr != Base)
27609 return false;
27610 AM = ISD::POST_INC;
27611 return true;
27612}
27613
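// Replace a bitcast from a boolean (vXi1) vector to a scalar by materialising
// the predicate as a scalar bitmask.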
27616 SelectionDAG &DAG) {
27617 SDLoc DL(N);
27618 SDValue Op = N->getOperand(0);
27619 EVT VT = N->getValueType(0);
27620 [[maybe_unused]] EVT SrcVT = Op.getValueType();
27621 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27622 "Must be bool vector.");
27623
27624 // Special handling for Clang's __builtin_convertvector. For vectors with <8
27625 // elements, it adds a vector concatenation with undef(s). If we encounter
27626 // this here, we can skip the concat.
27627 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
27628 bool AllUndef = true;
27629 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
27630 AllUndef &= Op.getOperand(I).isUndef();
27631
27632 if (AllUndef)
27633 Op = Op.getOperand(0);
27634 }
27635
27636 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
27637 if (VectorBits)
27638 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
27639}
27640
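// Expand a bitcast whose result type is not legal by placing the source in
// lane zero of ExtendVT, bitcasting that vector to CastVT, and extracting the
// requested type as a subvector from lane zero.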
27643 SelectionDAG &DAG, EVT ExtendVT,
27644 EVT CastVT) {
27645 SDLoc DL(N);
27646 SDValue Op = N->getOperand(0);
27647 EVT VT = N->getValueType(0);
27648
27649 // Use SCALAR_TO_VECTOR for lane zero
27650 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
27651 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
27652 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
27653 Results.push_back(
27654 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
27655}
27656
27657void AArch64TargetLowering::ReplaceBITCASTResults(
27659 SDLoc DL(N);
27660 SDValue Op = N->getOperand(0);
27661 EVT VT = N->getValueType(0);
27662 EVT SrcVT = Op.getValueType();
27663
27664 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
27665 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
27666 return;
27667 }
27668
27669 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
27670 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
27671 return;
27672 }
27673
27674 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
27675 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
27676 return;
27677 }
27678
27679 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
27680 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
27681 "Expected fp->int bitcast!");
27682
27683 // Bitcasting between unpacked vector types of different element counts is
27684 // not a NOP because the live elements are laid out differently.
27685 // 01234567
27686 // e.g. nxv2i32 = XX??XX??
27687 // nxv4f16 = X?X?X?X?
27688 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
27689 return;
27690
27691 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
27692 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
27693 return;
27694 }
27695
27696 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27697 !VT.isVector())
27698 return replaceBoolVectorBitcast(N, Results, DAG);
27699
27700 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
27701 return;
27702
27703 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
27704 DAG.getUNDEF(MVT::i32), Op);
27705 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
27706 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
27707}
27708
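// Replace a 256-bit (f)add of X and a <1,0,3,2,...> shuffle of X with an ADDP
// of the two halves of X, then duplicate each pairwise sum back into both
// lanes of its pair.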
27710 SelectionDAG &DAG,
27711 const AArch64Subtarget *Subtarget) {
27712 EVT VT = N->getValueType(0);
27713 if (!VT.is256BitVector() ||
27715 !N->getFlags().hasAllowReassociation()) ||
27716 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
27717 VT.getScalarType() == MVT::bf16)
27718 return;
27719
27720 SDValue X = N->getOperand(0);
27721 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
27722 if (!Shuf) {
27723 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
27724 X = N->getOperand(1);
27725 if (!Shuf)
27726 return;
27727 }
27728
27729 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
27730 return;
27731
27732 // Check the mask is 1,0,3,2,5,4,...
27733 ArrayRef<int> Mask = Shuf->getMask();
27734 for (int I = 0, E = Mask.size(); I < E; I++)
27735 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27736 return;
27737
27738 SDLoc DL(N);
27739 auto LoHi = DAG.SplitVector(X, DL);
27740 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27741 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
27742 LoHi.first, LoHi.second);
27743
27744 // Shuffle the elements back into order.
27745 SmallVector<int> NMask;
27746 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27747 NMask.push_back(I);
27748 NMask.push_back(I);
27749 }
27750 Results.push_back(
27751 DAG.getVectorShuffle(VT, DL,
27752 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
27753 DAG.getUNDEF(LoHi.first.getValueType())),
27754 DAG.getUNDEF(VT), NMask));
27755}
27756
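// Split the reduction's vector operand in half, combine the two halves with
// InterOp, and finish with the across-vector AcrossOp node.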
27759 SelectionDAG &DAG, unsigned InterOp,
27760 unsigned AcrossOp) {
27761 EVT LoVT, HiVT;
27762 SDValue Lo, Hi;
27763 SDLoc DL(N);
27764 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
27765 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
27766 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
27767 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
27768 Results.push_back(SplitVal);
27769}
27770
27771void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27773 SDValue In = N->getOperand(0);
27774 EVT InVT = In.getValueType();
27775
27776 // Common code will handle these just fine.
27777 if (!InVT.isScalableVector() || !InVT.isInteger())
27778 return;
27779
27780 SDLoc DL(N);
27781 EVT VT = N->getValueType(0);
27782
27783 // The following checks bail if this is not a halving operation.
27784
27786
27787 if (InVT.getVectorElementCount() != (ResEC * 2))
27788 return;
27789
27790 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
27791 if (!CIndex)
27792 return;
27793
27794 unsigned Index = CIndex->getZExtValue();
27795 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27796 return;
27797
27798 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27799 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27800
27801 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
27802 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
27803}
27804
27805void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
27807 assert((Subtarget->hasSVE2p1() ||
27808 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
27809 "Custom lower of get.active.lane.mask missing required feature.");
27810
27811 assert(N->getValueType(0) == MVT::nxv32i1 &&
27812 "Unexpected result type for get.active.lane.mask");
27813
27814 SDLoc DL(N);
27815 SDValue Idx = N->getOperand(0);
27816 SDValue TC = N->getOperand(1);
27817
27818 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
27819 "Unexpected operand type for get.active.lane.mask");
27820
27821 if (Idx.getValueType() != MVT::i64) {
27822 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
27823 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
27824 }
27825
27826 SDValue ID =
27827 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
27828 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
27829 auto WideMask =
27830 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
27831
27832 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
27833 {WideMask.getValue(0), WideMask.getValue(1)}));
27834}
27835
27836// Create an even/odd pair of X registers holding integer value V.
27838 SDLoc DL(V.getNode());
27839 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
27840 if (DAG.getDataLayout().isBigEndian())
27841 std::swap(VLo, VHi);
27842 SDValue RegClass =
27843 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
27844 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
27845 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
27846 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27847 return SDValue(
27848 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
27849}
27850
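// Expand a 128-bit ATOMIC_CMP_SWAP either to a CASP instruction (when LSE or
// outlined atomics are available, via an X-register pair) or to a CMP_SWAP_128
// pseudo, rebuilding the i128 results with BUILD_PAIR.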
27853 SelectionDAG &DAG,
27854 const AArch64Subtarget *Subtarget) {
27855 assert(N->getValueType(0) == MVT::i128 &&
27856 "AtomicCmpSwap on types less than 128 should be legal");
27857
27858 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27859 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27860 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27861 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27862 SDValue Ops[] = {
27863 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
27864 createGPRPairNode(DAG, N->getOperand(3)), // Store value
27865 N->getOperand(1), // Ptr
27866 N->getOperand(0), // Chain in
27867 };
27868
27869 unsigned Opcode;
27870 switch (MemOp->getMergedOrdering()) {
27872 Opcode = AArch64::CASPX;
27873 break;
27875 Opcode = AArch64::CASPAX;
27876 break;
27878 Opcode = AArch64::CASPLX;
27879 break;
27882 Opcode = AArch64::CASPALX;
27883 break;
27884 default:
27885 llvm_unreachable("Unexpected ordering!");
27886 }
27887
27888 MachineSDNode *CmpSwap = DAG.getMachineNode(
27889 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
27890 DAG.setNodeMemRefs(CmpSwap, {MemOp});
27891
27892 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27893 if (DAG.getDataLayout().isBigEndian())
27894 std::swap(SubReg1, SubReg2);
27895 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
27896 SDValue(CmpSwap, 0));
27897 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
27898 SDValue(CmpSwap, 0));
27899 Results.push_back(
27900 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27901 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
27902 return;
27903 }
27904
27905 unsigned Opcode;
27906 switch (MemOp->getMergedOrdering()) {
27908 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27909 break;
27911 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27912 break;
27914 Opcode = AArch64::CMP_SWAP_128_RELEASE;
27915 break;
27918 Opcode = AArch64::CMP_SWAP_128;
27919 break;
27920 default:
27921 llvm_unreachable("Unexpected ordering!");
27922 }
27923
27924 SDLoc DL(N);
27925 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
27926 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
27927 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
27928 New.first, New.second, N->getOperand(0)};
27929 SDNode *CmpSwap = DAG.getMachineNode(
27930 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
27931 Ops);
27932 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
27933
27934 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27935 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
27936 Results.push_back(SDValue(CmpSwap, 3));
27937}
27938
27939static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27940 AtomicOrdering Ordering) {
27941 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27942 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27943 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27944 // ATOMIC_LOAD_CLR at any point.
27945 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27946 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27947 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27948 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27949
27950 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27951 // The operand will need to be XORed in a separate step.
27952 switch (Ordering) {
27954 return AArch64::LDCLRP;
27955 break;
27957 return AArch64::LDCLRPA;
27958 break;
27960 return AArch64::LDCLRPL;
27961 break;
27964 return AArch64::LDCLRPAL;
27965 break;
27966 default:
27967 llvm_unreachable("Unexpected ordering!");
27968 }
27969 }
27970
27971 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
27972 switch (Ordering) {
27974 return AArch64::LDSETP;
27975 break;
27977 return AArch64::LDSETPA;
27978 break;
27980 return AArch64::LDSETPL;
27981 break;
27984 return AArch64::LDSETPAL;
27985 break;
27986 default:
27987 llvm_unreachable("Unexpected ordering!");
27988 }
27989 }
27990
27991 if (ISDOpcode == ISD::ATOMIC_SWAP) {
27992 switch (Ordering) {
27994 return AArch64::SWPP;
27995 break;
27997 return AArch64::SWPPA;
27998 break;
28000 return AArch64::SWPPL;
28001 break;
28004 return AArch64::SWPPAL;
28005 break;
28006 default:
28007 llvm_unreachable("Unexpected ordering!");
28008 }
28009 }
28010
28011 llvm_unreachable("Unexpected ISDOpcode!");
28012}
28013
28016 SelectionDAG &DAG,
28017 const AArch64Subtarget *Subtarget) {
28018 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
28019 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28020 // rather than the CASP instructions, because CASP has register classes for
28021 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28022 // to present them as single operands. LSE128 instructions use the GPR64
28023 // register class (because the pair does not have to be sequential), like
28024 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28025
28026 assert(N->getValueType(0) == MVT::i128 &&
28027 "AtomicLoadXXX on types less than 128 should be legal");
28028
28029 if (!Subtarget->hasLSE128())
28030 return;
28031
28032 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28033 const SDValue &Chain = N->getOperand(0);
28034 const SDValue &Ptr = N->getOperand(1);
28035 const SDValue &Val128 = N->getOperand(2);
28036 std::pair<SDValue, SDValue> Val2x64 =
28037 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28038
28039 const unsigned ISDOpcode = N->getOpcode();
28040 const unsigned MachineOpcode =
28041 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28042
28043 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28044 SDLoc DL(Val128);
28045 Val2x64.first =
28046 DAG.getNode(ISD::XOR, DL, MVT::i64,
28047 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28048 Val2x64.second =
28049 DAG.getNode(ISD::XOR, DL, MVT::i64,
28050 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28051 }
28052
28053 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28054 if (DAG.getDataLayout().isBigEndian())
28055 std::swap(Ops[0], Ops[1]);
28056
28057 MachineSDNode *AtomicInst =
28058 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28059 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28060
28061 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28062
28063 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28064 if (DAG.getDataLayout().isBigEndian())
28065 std::swap(Lo, Hi);
28066
28067 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28068 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28069}
28070
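// Custom result-type legalisation: replace the results of the nodes handled
// below with target-specific expansions that use legal types.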
28071void AArch64TargetLowering::ReplaceNodeResults(
28073 switch (N->getOpcode()) {
28074 default:
28075 llvm_unreachable("Don't know how to custom expand this");
28076 case ISD::BITCAST:
28077 ReplaceBITCASTResults(N, Results, DAG);
28078 return;
28079 case ISD::VECREDUCE_ADD:
28084 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28085 return;
28087 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28088 Results.push_back(Res);
28089 return;
28090 case ISD::ADD:
28091 case ISD::FADD:
28092 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28093 return;
28094
28095 case ISD::CTPOP:
28096 case ISD::PARITY:
28097 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28098 Results.push_back(Result);
28099 return;
28100 case AArch64ISD::SADDV:
28101 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28102 return;
28103 case AArch64ISD::UADDV:
28104 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28105 return;
28106 case AArch64ISD::SMINV:
28107 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28108 return;
28109 case AArch64ISD::UMINV:
28110 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28111 return;
28112 case AArch64ISD::SMAXV:
28113 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28114 return;
28115 case AArch64ISD::UMAXV:
28116 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28117 return;
28118 case ISD::MULHS:
28120 Results.push_back(
28121 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28122 return;
28123 case ISD::MULHU:
28125 Results.push_back(
28126 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28127 return;
28128 case ISD::FP_TO_UINT:
28129 case ISD::FP_TO_SINT:
28132 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28133 // Let normal code take care of it by not adding anything to Results.
28134 return;
28136 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28137 return;
28139 assert(N->getValueType(0) != MVT::i128 &&
28140 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28141 break;
28144 case ISD::ATOMIC_SWAP: {
28145 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28146 "Expected 128-bit atomicrmw.");
28147 // These need custom type legalisation, so go directly to the instruction.
28148 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28149 return;
28150 }
28151 case ISD::ADDRSPACECAST: {
28152 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28153 Results.push_back(V);
28154 return;
28155 }
28156 case ISD::ATOMIC_LOAD:
28157 case ISD::LOAD: {
28158 MemSDNode *LoadNode = cast<MemSDNode>(N);
28159 EVT MemVT = LoadNode->getMemoryVT();
28160 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
28161 // targets.
28162 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28163 MemVT.getSizeInBits() == 256u &&
28164 (MemVT.getScalarSizeInBits() == 8u ||
28165 MemVT.getScalarSizeInBits() == 16u ||
28166 MemVT.getScalarSizeInBits() == 32u ||
28167 MemVT.getScalarSizeInBits() == 64u)) {
28168
28169 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28171 AArch64ISD::LDNP, SDLoc(N),
28172 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28173 {LoadNode->getChain(), LoadNode->getBasePtr()},
28174 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28175
28176 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28177 DAG.getBitcast(HalfVT, Result.getValue(0)),
28178 DAG.getBitcast(HalfVT, Result.getValue(1)));
28179 Results.append({Pair, Result.getValue(2) /* Chain */});
28180 return;
28181 }
28182
28183 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28184 LoadNode->getMemoryVT() != MVT::i128) {
28185 // Loads that are neither volatile nor atomic are optimized later in
28186 // AArch64's load/store optimizer.
28187 return;
28188 }
28189
28190 if (SDValue(N, 0).getValueType() == MVT::i128) {
28191 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28192 bool isLoadAcquire =
28194 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28195
28196 if (isLoadAcquire)
28197 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28198
28200 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28201 {LoadNode->getChain(), LoadNode->getBasePtr()},
28202 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28203
28204 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28205
28206 SDValue Pair =
28207 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28208 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28209 Results.append({Pair, Result.getValue(2) /* Chain */});
28210 }
28211 return;
28212 }
28214 ReplaceExtractSubVectorResults(N, Results, DAG);
28215 return;
28218 // Custom lowering has been requested for INSERT_SUBVECTOR and
28219 // CONCAT_VECTORS -- but delegate to common code for result type
28220 // legalisation
28221 return;
28223 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28224 return;
28226 EVT VT = N->getValueType(0);
28227
28228 Intrinsic::ID IntID =
28229 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28230 switch (IntID) {
28231 default:
28232 return;
28233 case Intrinsic::aarch64_sve_clasta_n: {
28234 assert((VT == MVT::i8 || VT == MVT::i16) &&
28235 "custom lowering for unexpected type");
28236 SDLoc DL(N);
28237 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28238 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28239 N->getOperand(1), Op2, N->getOperand(3));
28240 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28241 return;
28242 }
28243 case Intrinsic::aarch64_sve_clastb_n: {
28244 assert((VT == MVT::i8 || VT == MVT::i16) &&
28245 "custom lowering for unexpected type");
28246 SDLoc DL(N);
28247 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28248 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28249 N->getOperand(1), Op2, N->getOperand(3));
28250 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28251 return;
28252 }
28253 case Intrinsic::aarch64_sve_lasta: {
28254 assert((VT == MVT::i8 || VT == MVT::i16) &&
28255 "custom lowering for unexpected type");
28256 SDLoc DL(N);
28257 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28258 N->getOperand(1), N->getOperand(2));
28259 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28260 return;
28261 }
28262 case Intrinsic::aarch64_sve_lastb: {
28263 assert((VT == MVT::i8 || VT == MVT::i16) &&
28264 "custom lowering for unexpected type");
28265 SDLoc DL(N);
28266 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28267 N->getOperand(1), N->getOperand(2));
28268 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28269 return;
28270 }
28271 case Intrinsic::aarch64_sme_in_streaming_mode: {
28272 SDLoc DL(N);
28273 SDValue Chain = DAG.getEntryNode();
28274
28275 SDValue RuntimePStateSM =
28276 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28277 Results.push_back(
28278 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28279 return;
28280 }
28281 case Intrinsic::experimental_vector_match: {
28282 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28283 return;
28284
28285 // NOTE: Only trivial type promotion is supported.
28286 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28287 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28288 return;
28289
28290 SDLoc DL(N);
28291 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28292 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28293 return;
28294 }
28295 }
28296 }
28297 case ISD::READ_REGISTER: {
28298 SDLoc DL(N);
28299 assert(N->getValueType(0) == MVT::i128 &&
28300 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28301 SDValue Chain = N->getOperand(0);
28302 SDValue SysRegName = N->getOperand(1);
28303
28304 SDValue Result = DAG.getNode(
28305 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28306 Chain, SysRegName);
28307
28308 // Sysregs are not endian. Result.getValue(0) always contains the lower half
28309 // of the 128-bit System Register value.
28310 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28311 Result.getValue(0), Result.getValue(1));
28312 Results.push_back(Pair);
28313 Results.push_back(Result.getValue(2)); // Chain
28314 return;
28315 }
28316 }
28317}
28318
28320 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28322 return true;
28323}
28324
28325unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
28326 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28327 // reciprocal if there are three or more FDIVs.
28328 return 3;
28329}
28330
28333 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
28334 // v4i16, v2i32 instead of promoting them.
28335 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28336 VT == MVT::v1f32)
28337 return TypeWidenVector;
28338
28340}
28341
28342// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28343// provided the address is 16-byte aligned.
28345 if (!Subtarget->hasLSE2())
28346 return false;
28347
28348 if (auto LI = dyn_cast<LoadInst>(I))
28349 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28350 LI->getAlign() >= Align(16);
28351
28352 if (auto SI = dyn_cast<StoreInst>(I))
28353 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28354 SI->getAlign() >= Align(16);
28355
28356 return false;
28357}
28358
28360 if (!Subtarget->hasLSE128())
28361 return false;
28362
28363 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
28364 // will clobber the two registers.
28365 if (const auto *SI = dyn_cast<StoreInst>(I))
28366 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28367 SI->getAlign() >= Align(16) &&
28368 (SI->getOrdering() == AtomicOrdering::Release ||
28369 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
28370
28371 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
28372 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28373 RMW->getAlign() >= Align(16) &&
28374 (RMW->getOperation() == AtomicRMWInst::Xchg ||
28375 RMW->getOperation() == AtomicRMWInst::And ||
28376 RMW->getOperation() == AtomicRMWInst::Or);
28377
28378 return false;
28379}
28380
28382 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
28383 return false;
28384
28385 if (auto LI = dyn_cast<LoadInst>(I))
28386 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28387 LI->getAlign() >= Align(16) &&
28388 LI->getOrdering() == AtomicOrdering::Acquire;
28389
28390 if (auto SI = dyn_cast<StoreInst>(I))
28391 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28392 SI->getAlign() >= Align(16) &&
28393 SI->getOrdering() == AtomicOrdering::Release;
28394
28395 return false;
28396}
28397
28399 const Instruction *I) const {
28401 return false;
28403 return false;
28405 return true;
28406 return false;
28407}
28408
28410 const Instruction *I) const {
28411 // Store-Release instructions only provide seq_cst guarantees when paired with
28412 // Load-Acquire instructions. MSVC CRT does not use these instructions to
28413 // implement seq_cst loads and stores, so we need additional explicit fences
28414 // after memory writes.
28415 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28416 return false;
28417
28418 switch (I->getOpcode()) {
28419 default:
28420 return false;
28421 case Instruction::AtomicCmpXchg:
28422 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
28424 case Instruction::AtomicRMW:
28425 return cast<AtomicRMWInst>(I)->getOrdering() ==
28427 case Instruction::Store:
28428 return cast<StoreInst>(I)->getOrdering() ==
28430 }
28431}
28432
28433// Loads and stores less than 128-bits are already atomic; ones above that
28434// are doomed anyway, so defer to the default libcall and blame the OS when
28435// things go wrong.
28438 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
28439 if (Size != 128)
28441 if (isOpSuitableForRCPC3(SI))
28443 if (isOpSuitableForLSE128(SI))
28445 if (isOpSuitableForLDPSTP(SI))
28448}
28449
28450// Loads and stores less than 128-bits are already atomic; ones above that
28451// are doomed anyway, so defer to the default libcall and blame the OS when
28452// things go wrong.
28453TargetLowering::AtomicExpansionKind
28454AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
28455 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
28456
28457 if (Size != 128)
28458 return AtomicExpansionKind::None;
28459 if (isOpSuitableForRCPC3(LI))
28460 return AtomicExpansionKind::None;
28461 // No LSE128 loads
28462 if (isOpSuitableForLDPSTP(LI))
28463 return AtomicExpansionKind::None;
28464
28465 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28466 // implement atomicrmw without spilling. If the target address is also on the
28467 // stack and close enough to the spill slot, this can lead to a situation
28468 // where the monitor always gets cleared and the atomic operation can never
28469 // succeed. So at -O0 lower this operation to a CAS loop.
28470 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28471 return AtomicExpansionKind::CmpXChg;
28472
28473 // Using CAS for an atomic load has a better chance of succeeding under high
28474 // contention situations. So use it if available.
28475 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28476 : AtomicExpansionKind::LLSC;
28477}
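// Editor's note (not from this file): the CmpXChg expansion chosen above turns
// a 128-bit atomic load into a compare-exchange against a dummy value, whose
// "old value" result is the loaded data. In IR the expansion looks roughly like
//   %pair = cmpxchg ptr %p, i128 0, i128 0 seq_cst seq_cst, align 16
//   %val  = extractvalue { i128, i1 } %pair, 0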
28478
28479// Return true if the atomic operation expansion will lower to use a library
28480// call, and is thus ineligible to use an LLSC expansion.
28481static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28482 const AtomicRMWInst *RMW) {
28483 if (!RMW->isFloatingPointOperation())
28484 return false;
28485 switch (RMW->getType()->getScalarType()->getTypeID()) {
28486 case Type::FloatTyID:
28487 case Type::DoubleTyID:
28488 case Type::HalfTyID:
28489 case Type::BFloatTyID:
28490 // Will use soft float
28491 return !Subtarget.hasFPARMv8();
28492 default:
28493 // fp128 will emit library calls.
28494 return true;
28495 }
28496
28497 llvm_unreachable("covered type switch");
28498}
28499
28500// The "default" for integer RMW operations is to expand to an LL/SC loop.
28501// However, with the LSE instructions (or outline-atomics mode, which provides
28502// library routines in place of the LSE-instructions), we can directly emit many
28503// operations instead.
28504TargetLowering::AtomicExpansionKind
28505AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
28506 Type *Ty = AI->getType();
28507 unsigned Size = Ty->getPrimitiveSizeInBits();
28508 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
28509
28510 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
28511 (AI->getOperation() == AtomicRMWInst::Xchg ||
28512 AI->getOperation() == AtomicRMWInst::Or ||
28513 AI->getOperation() == AtomicRMWInst::And);
28514 if (CanUseLSE128)
28515 return AtomicExpansionKind::None;
28516
28517 // If LSFE available, use atomic FP instructions in preference to expansion
28518 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
28519 AI->getOperation() == AtomicRMWInst::FMax ||
28520 AI->getOperation() == AtomicRMWInst::FMin ||
28521 AI->getOperation() == AtomicRMWInst::FMaximum ||
28522 AI->getOperation() == AtomicRMWInst::FMinimum))
28523 return AtomicExpansionKind::None;
28524
28525 // Nand is not supported in LSE.
28526 // Leave 128 bits to LLSC or CmpXChg.
28527 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
28528 !AI->isFloatingPointOperation()) {
28529 if (Subtarget->hasLSE())
28530 return AtomicExpansionKind::None;
28531 if (Subtarget->outlineAtomics()) {
28532 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
28533 // Don't outline them unless
28534 // (1) high level <atomic> support approved:
28535 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
28536 // (2) low level libgcc and compiler-rt support implemented by:
28537 // min/max outline atomics helpers
28538 if (AI->getOperation() != AtomicRMWInst::Min &&
28539 AI->getOperation() != AtomicRMWInst::Max &&
28540 AI->getOperation() != AtomicRMWInst::UMin &&
28541 AI->getOperation() != AtomicRMWInst::UMax) {
28542 return AtomicExpansionKind::None;
28543 }
28544 }
28545 }
28546
28547 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28548 // implement atomicrmw without spilling. If the target address is also on the
28549 // stack and close enough to the spill slot, this can lead to a situation
28550 // where the monitor always gets cleared and the atomic operation can never
28551 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
28552 // we have a single CAS instruction that can replace the loop.
28553 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
28554 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
28555 return AtomicExpansionKind::CmpXChg;
28556
28557 return AtomicExpansionKind::LLSC;
28558}
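// Editor's note (not from this file): the reason LSE makes expansion
// unnecessary is that sub-128-bit integer RMWs select directly to atomic
// instructions, e.g.
//   atomicrmw add ptr %p, i32 %v seq_cst   ->   ldaddal w1, w2, [x0]
// whereas without LSE the same operation becomes a ldaxr/stlxr retry loop.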
28559
28560TargetLowering::AtomicExpansionKind
28561AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
28562 AtomicCmpXchgInst *AI) const {
28563 // If subtarget has LSE, leave cmpxchg intact for codegen.
28564 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
28565 return AtomicExpansionKind::None;
28566 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28567 // implement cmpxchg without spilling. If the address being exchanged is also
28568 // on the stack and close enough to the spill slot, this can lead to a
28569 // situation where the monitor always gets cleared and the atomic operation
28570 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
28571 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28572 return AtomicExpansionKind::None;
28573
28574 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
28575 // it.
28576 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
28577 if (Size > 64)
28578 return AtomicExpansionKind::None;
28579
28580 return AtomicExpansionKind::LLSC;
28581}
28582
28583Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
28584 Type *ValueTy, Value *Addr,
28585 AtomicOrdering Ord) const {
28586 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28587 bool IsAcquire = isAcquireOrStronger(Ord);
28588
28589 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
28590 // intrinsic must return {i64, i64} and we have to recombine them into a
28591 // single i128 here.
28592 if (ValueTy->getPrimitiveSizeInBits() == 128) {
28593 Intrinsic::ID Int =
28594 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
28595
28596 Value *LoHi =
28597 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
28598
28599 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
28600 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
28601
28602 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
28603 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
28604 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
28605
28606 Value *Or = Builder.CreateOr(
28607 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
28608 return Builder.CreateBitCast(Or, ValueTy);
28609 }
28610
28611 Type *Tys[] = { Addr->getType() };
28612 Intrinsic::ID Int =
28613 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
28614
28615 const DataLayout &DL = M->getDataLayout();
28616 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
28617 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
28618 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
28619 Attribute::ElementType, IntEltTy));
28620 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
28621
28622 return Builder.CreateBitCast(Trunc, ValueTy);
28623}
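// Editor's note (not from this file): for the 128-bit acquire case the code
// above produces IR of roughly this shape:
//   %lohi  = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo    = extractvalue { i64, i64 } %lohi, 0
//   %hi    = extractvalue { i64, i64 } %lohi, 1
//   %lo64  = zext i64 %lo to i128
//   %hi64  = zext i64 %hi to i128
//   %shl   = shl i128 %hi64, 64
//   %val64 = or i128 %lo64, %shl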
28624
28625void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
28626 IRBuilderBase &Builder) const {
28627 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
28628}
28629
28630Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
28631 Value *Val, Value *Addr,
28632 AtomicOrdering Ord) const {
28633 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28634 bool IsRelease = isReleaseOrStronger(Ord);
28635
28636 // Since the intrinsics must have legal type, the i128 intrinsics take two
28637 // parameters: "i64, i64". We must marshal Val into the appropriate form
28638 // before the call.
28639 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
28640 Intrinsic::ID Int =
28641 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
28642 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
28643 Type *Int64Ty = Type::getInt64Ty(M->getContext());
28644 Type *Int128Ty = Type::getInt128Ty(M->getContext());
28645
28646 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
28647
28648 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
28649 Value *Hi =
28650 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
28651 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
28652 }
28653
28654 Intrinsic::ID Int =
28655 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
28656 Type *Tys[] = { Addr->getType() };
28657 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
28658
28659 const DataLayout &DL = M->getDataLayout();
28660 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
28661 Val = Builder.CreateBitCast(Val, IntValTy);
28662
28663 CallInst *CI = Builder.CreateCall(
28664 Stxr, {Builder.CreateZExtOrBitCast(
28665 Val, Stxr->getFunctionType()->getParamType(0)),
28666 Addr});
28667 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
28668 Attribute::ElementType, Val->getType()));
28669 return CI;
28670}
28671
28672bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
28673 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
28674 const DataLayout &DL) const {
28675 if (!Ty->isArrayTy()) {
28676 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
28677 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
28678 }
28679
28680 // All non aggregate members of the type must have the same type
28681 SmallVector<EVT> ValueVTs;
28682 ComputeValueVTs(*this, DL, Ty, ValueVTs);
28683 return all_equal(ValueVTs);
28684}
28685
28686bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
28687 EVT) const {
28688 return false;
28689}
28690
28691static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
28692 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
28693 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
28694 M, Intrinsic::thread_pointer, IRB.getPtrTy());
28695 return IRB.CreatePointerCast(
28696 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
28697 Offset),
28698 IRB.getPtrTy(0));
28699}
28700
28701Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
28702 // Android provides a fixed TLS slot for the stack cookie. See the definition
28703 // of TLS_SLOT_STACK_GUARD in
28704 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
28705 if (Subtarget->isTargetAndroid())
28706 return UseTlsOffset(IRB, 0x28);
28707
28708 // Fuchsia is similar.
28709 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
28710 if (Subtarget->isTargetFuchsia())
28711 return UseTlsOffset(IRB, -0x10);
28712
28713 return TargetLowering::getIRStackGuard(IRB);
28714}
28715
28716void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
28717 // MSVC CRT provides functionalities for stack protection.
28718 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
28719 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
28720
28721 RTLIB::LibcallImpl SecurityCookieVar =
28722 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
28723 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
28724 SecurityCookieVar != RTLIB::Unsupported) {
28725 // MSVC CRT has a global variable holding security cookie.
28726 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
28727 PointerType::getUnqual(M.getContext()));
28728
28729 // MSVC CRT has a function to validate security cookie.
28730 FunctionCallee SecurityCheckCookie =
28731 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
28732 Type::getVoidTy(M.getContext()),
28733 PointerType::getUnqual(M.getContext()));
28734 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
28735 F->setCallingConv(CallingConv::Win64);
28736 F->addParamAttr(0, Attribute::AttrKind::InReg);
28737 }
28738 return;
28739 }
28740 TargetLowering::insertSSPDeclarations(M);
28741}
28742
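// Editor's note (not from this file): on MSVC environments the declarations
// made above correspond roughly to the CRT's
//   extern void *__security_cookie;
//   void __security_check_cookie(void *cookie);   // cookie passed in a register
// which is why the callee gets the Win64 calling convention and an InReg
// first parameter.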
28743Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
28744 // MSVC CRT has a function to validate security cookie.
28745 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
28746 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
28747 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
28748 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
28749 return nullptr;
28750}
28751
28752Value *
28753AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
28754 // Android provides a fixed TLS slot for the SafeStack pointer. See the
28755 // definition of TLS_SLOT_SAFESTACK in
28756 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
28757 if (Subtarget->isTargetAndroid())
28758 return UseTlsOffset(IRB, 0x48);
28759
28760 // Fuchsia is similar.
28761 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
28762 if (Subtarget->isTargetFuchsia())
28763 return UseTlsOffset(IRB, -0x8);
28764
28765 return TargetLowering::getSafeStackPointerLocation(IRB);
28766}
28767
28768/// If a physical register, this returns the register that receives the
28769/// exception address on entry to an EH pad.
28770Register AArch64TargetLowering::getExceptionPointerRegister(
28771 const Constant *PersonalityFn) const {
28772 // FIXME: This is a guess. Has this been defined yet?
28773 return AArch64::X0;
28774}
28775
28776/// If a physical register, this returns the register that receives the
28777/// exception typeid on entry to a landing pad.
28778Register AArch64TargetLowering::getExceptionSelectorRegister(
28779 const Constant *PersonalityFn) const {
28780 // FIXME: This is a guess. Has this been defined yet?
28781 return AArch64::X1;
28782}
28783
28784bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
28785 const Instruction &AndI) const {
28786 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
28787 // this is likely to let the and/cmp/br fold into a single tbz instruction. It
28788 // may be beneficial to sink in other cases, but we would have to check that
28789 // the cmp would not get folded into the br to form a cbz for these to be
28790 // beneficial.
28791 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
28792 if (!Mask)
28793 return false;
28794 return Mask->getValue().isPowerOf2();
28795}
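// Editor's note (not from this file): the single-bit case this hook targets is
// source like
//   if (flags & 0x10) slowpath();
// where keeping the 'and' next to its compare lets ISel emit one test-bit
// branch, e.g. "tbnz w0, #4, .LBB0_2", instead of an and/cmp/b.ne sequence.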
28796
28800 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28801 SelectionDAG &DAG) const {
28802 // Does baseline recommend not to perform the fold by default?
28804 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28805 return false;
28806 // Else, if this is a vector shift, prefer 'shl'.
28807 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28808}
28809
28810TargetLowering::ShiftLegalizationStrategy
28811AArch64TargetLowering::preferredShiftLegalizationStrategy(
28812 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28813 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
28814 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28815 return ShiftLegalizationStrategy::LowerToLibcall;
28816 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
28817 ExpansionFactor);
28818}
28819
28820void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
28821 // Update IsSplitCSR in AArch64FunctionInfo.
28822 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28823 AFI->setIsSplitCSR(true);
28824}
28825
28826void AArch64TargetLowering::insertCopiesSplitCSR(
28827 MachineBasicBlock *Entry,
28828 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28829 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28830 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28831 if (!IStart)
28832 return;
28833
28834 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28835 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28836 MachineBasicBlock::iterator MBBI = Entry->begin();
28837 for (const MCPhysReg *I = IStart; *I; ++I) {
28838 const TargetRegisterClass *RC = nullptr;
28839 if (AArch64::GPR64RegClass.contains(*I))
28840 RC = &AArch64::GPR64RegClass;
28841 else if (AArch64::FPR64RegClass.contains(*I))
28842 RC = &AArch64::FPR64RegClass;
28843 else
28844 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28845
28846 Register NewVR = MRI->createVirtualRegister(RC);
28847 // Create copy from CSR to a virtual register.
28848 // FIXME: this currently does not emit CFI pseudo-instructions, it works
28849 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28850 // nounwind. If we want to generalize this later, we may need to emit
28851 // CFI pseudo-instructions.
28852 assert(Entry->getParent()->getFunction().hasFnAttribute(
28853 Attribute::NoUnwind) &&
28854 "Function should be nounwind in insertCopiesSplitCSR!");
28855 Entry->addLiveIn(*I);
28856 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
28857 .addReg(*I);
28858
28859 // Insert the copy-back instructions right before the terminator.
28860 for (auto *Exit : Exits)
28861 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
28862 TII->get(TargetOpcode::COPY), *I)
28863 .addReg(NewVR);
28864 }
28865}
28866
28867bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
28868 // Integer division on AArch64 is expensive. However, when aggressively
28869 // optimizing for code size, we prefer to use a div instruction, as it is
28870 // usually smaller than the alternative sequence.
28871 // The exception to this is vector division. Since AArch64 doesn't have vector
28872 // integer division, leaving the division as-is is a loss even in terms of
28873 // size, because it will have to be scalarized, while the alternative code
28874 // sequence can be performed in vector form.
28875 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
28876 return OptSize && !VT.isVector();
28877}
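// Editor's note (not from this file): as a concrete example, at minsize a
// scalar "x / 10" stays a single sdiv, while a v4i32 divide by 10 is still
// rewritten into a vector multiply-high and shift sequence, since AArch64 has
// no vector integer divide and scalarising would be larger anyway.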
28878
28879bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
28880 const MachineFunction &MF) const {
28881 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
28882 // In future, we could allow this when SVE is available, but currently,
28883 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
28884 // the general lowering may introduce stack spills/reloads).
28885 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28886 return false;
28887
28888 // Do not merge to float value size (128 bits) if no implicit float attribute
28889 // is set.
28890 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
28891 return !NoFloat || MemVT.getSizeInBits() <= 64;
28892}
28893
28894bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
28895 // We want inc-of-add for scalars and sub-of-not for vectors.
28896 return VT.isScalarInteger();
28897}
28898
28899bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
28900 EVT VT) const {
28901 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
28902 // legalize.
28903 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28904 return false;
28905 if (FPVT == MVT::v8bf16)
28906 return false;
28907 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28908}
28909
28910bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
28911 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28912 // avoid vselect becoming bsl / unrolling.
28913 return !VT.isFixedLengthVector();
28914}
28915
28916MachineInstr *
28917AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
28918 MachineBasicBlock::instr_iterator &MBBI,
28919 const TargetInstrInfo *TII) const {
28920 assert(MBBI->isCall() && MBBI->getCFIType() &&
28921 "Invalid call instruction for a KCFI check");
28922
28923 switch (MBBI->getOpcode()) {
28924 case AArch64::BLR:
28925 case AArch64::BLRNoIP:
28926 case AArch64::TCRETURNri:
28927 case AArch64::TCRETURNrix16x17:
28928 case AArch64::TCRETURNrix17:
28929 case AArch64::TCRETURNrinotx16:
28930 break;
28931 default:
28932 llvm_unreachable("Unexpected CFI call opcode");
28933 }
28934
28935 MachineOperand &Target = MBBI->getOperand(0);
28936 assert(Target.isReg() && "Invalid target operand for an indirect call");
28937 Target.setIsRenamable(false);
28938
28939 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
28940 .addReg(Target.getReg())
28941 .addImm(MBBI->getCFIType())
28942 .getInstr();
28943}
28944
28945bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
28946 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28947}
28948
28949unsigned
28950AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
28951 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28952 return getPointerTy(DL).getSizeInBits();
28953
28954 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28955}
28956
28957void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28958 MachineFrameInfo &MFI = MF.getFrameInfo();
28959 // If we have any vulnerable SVE stack objects then the stack protector
28960 // needs to be placed at the top of the SVE stack area, as the SVE locals
28961 // are placed above the other locals, so we allocate it as if it were a
28962 // scalable vector.
28963 // FIXME: It may be worthwhile having a specific interface for this rather
28964 // than doing it here in finalizeLowering.
28965 if (MFI.hasStackProtectorIndex()) {
28966 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28972 break;
28973 }
28974 }
28975 }
28978}
28979
28980// Unlike X86, we let frame lowering assign offsets to all catch objects.
28981bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
28982
28983bool AArch64TargetLowering::shouldLocalize(
28984 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28985 auto &MF = *MI.getMF();
28986 auto &MRI = MF.getRegInfo();
28987 auto maxUses = [](unsigned RematCost) {
28988 // A cost of 1 means remats are basically free.
28989 if (RematCost == 1)
28990 return std::numeric_limits<unsigned>::max();
28991 if (RematCost == 2)
28992 return 2U;
28993
28994 // Remat is too expensive, only sink if there's one user.
28995 if (RematCost > 2)
28996 return 1U;
28997 llvm_unreachable("Unexpected remat cost");
28998 };
28999
29000 unsigned Opc = MI.getOpcode();
29001 switch (Opc) {
29002 case TargetOpcode::G_GLOBAL_VALUE: {
29003 // On Darwin, TLS global vars get selected into function calls, which
29004 // we don't want localized, as they can get moved into the middle of
29005 // another call sequence.
29006 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29007 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29008 return false;
29009 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29010 }
29011 case TargetOpcode::G_FCONSTANT:
29012 case TargetOpcode::G_CONSTANT: {
29013 const ConstantInt *CI;
29014 unsigned AdditionalCost = 0;
29015
29016 if (Opc == TargetOpcode::G_CONSTANT)
29017 CI = MI.getOperand(1).getCImm();
29018 else {
29019 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29020 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29021 // materialized as integers.
29022 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29023 break;
29024 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29025 bool OptForSize = MF.getFunction().hasOptSize();
29026 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getSizeInBits()),
29027 OptForSize))
29028 return true; // Constant should be cheap.
29029 CI =
29030 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29031 // FP materialization also costs an extra move, from gpr to fpr.
29032 AdditionalCost = 1;
29033 }
29034 APInt Imm = CI->getValue();
29035 InstructionCost Cost = TTI->getIntImmCost(
29036 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
29037 assert(Cost.isValid() && "Expected a valid imm cost");
29038
29039 unsigned RematCost = Cost.getValue();
29040 RematCost += AdditionalCost;
29041 Register Reg = MI.getOperand(0).getReg();
29042 unsigned MaxUses = maxUses(RematCost);
29043 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29044 if (MaxUses == std::numeric_limits<unsigned>::max())
29045 --MaxUses;
29046 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29047 }
29048 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29049 // localizable.
29050 case AArch64::ADRP:
29051 case AArch64::G_ADD_LOW:
29052 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29053 case TargetOpcode::G_PTR_ADD:
29054 return true;
29055 default:
29056 break;
29057 }
29058 return TargetLoweringBase::shouldLocalize(MI, TTI);
29059}
29060
29061bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
29062 // Fallback for scalable vectors.
29063 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29064 // all instructions, regardless of whether they are actually supported.
29065 if (!EnableSVEGISel) {
29066 if (Inst.getType()->isScalableTy()) {
29067 return true;
29068 }
29069
29070 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29071 if (Inst.getOperand(i)->getType()->isScalableTy())
29072 return true;
29073
29074 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29075 if (AI->getAllocatedType()->isScalableTy())
29076 return true;
29077 }
29078 }
29079
29080 // Checks to allow the use of SME instructions
29081 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29082 auto CallAttrs = SMECallAttrs(*Base, this);
29083 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29084 CallAttrs.requiresPreservingZT0() ||
29085 CallAttrs.requiresPreservingAllZAState())
29086 return true;
29087 }
29088 return false;
29089}
29090
29091// Return the largest legal scalable vector type that matches VT's element type.
29095 "Expected legal fixed length vector!");
29096 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29097 default:
29098 llvm_unreachable("unexpected element type for SVE container");
29099 case MVT::i8:
29100 return EVT(MVT::nxv16i8);
29101 case MVT::i16:
29102 return EVT(MVT::nxv8i16);
29103 case MVT::i32:
29104 return EVT(MVT::nxv4i32);
29105 case MVT::i64:
29106 return EVT(MVT::nxv2i64);
29107 case MVT::bf16:
29108 return EVT(MVT::nxv8bf16);
29109 case MVT::f16:
29110 return EVT(MVT::nxv8f16);
29111 case MVT::f32:
29112 return EVT(MVT::nxv4f32);
29113 case MVT::f64:
29114 return EVT(MVT::nxv2f64);
29115 }
29116}
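// Editor's example (not from this file): this mapping depends only on the
// element type, so a fixed v8i32 value is carried in an nxv4i32 container and
// a v16f16 value in nxv8f16; the fixed element count merely has to fit within
// the subtarget's minimum SVE register width for the type to be legal here.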
29117
29118// Return a predicate with active lanes corresponding to the extent of VT.
29119static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
29120 EVT VT) {
29121 assert(VT.isFixedLengthVector() &&
29122 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29123 "Expected legal fixed length vector!");
29124
29125 std::optional<unsigned> PgPattern =
29126 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
29127 assert(PgPattern && "Unexpected element count for SVE predicate");
29128
29129 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29130 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29131 // variants of instructions when available.
29132 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29133 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29134 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29135 if (MaxSVESize && MinSVESize == MaxSVESize &&
29136 MaxSVESize == VT.getSizeInBits())
29137 PgPattern = AArch64SVEPredPattern::all;
29138
29139 MVT MaskVT;
29140 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29141 default:
29142 llvm_unreachable("unexpected element type for SVE predicate");
29143 case MVT::i8:
29144 MaskVT = MVT::nxv16i1;
29145 break;
29146 case MVT::i16:
29147 case MVT::f16:
29148 case MVT::bf16:
29149 MaskVT = MVT::nxv8i1;
29150 break;
29151 case MVT::i32:
29152 case MVT::f32:
29153 MaskVT = MVT::nxv4i1;
29154 break;
29155 case MVT::i64:
29156 case MVT::f64:
29157 MaskVT = MVT::nxv2i1;
29158 break;
29159 }
29160
29161 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29162}
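// Editor's example (not from this file): for a fixed v8i32 operand this
// returns the equivalent of "ptrue p0.s, vl8" (an nxv4i1 mask with the first
// eight lanes active); the "all" pattern is only used when the fixed width is
// known to equal the exact SVE register size.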
29163
29164static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
29165 EVT VT) {
29166 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29167 "Expected legal scalable vector!");
29168 auto PredTy = VT.changeVectorElementType(MVT::i1);
29169 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29170}
29171
29172static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
29173 if (VT.isFixedLengthVector())
29174 return getPredicateForFixedLengthVector(DAG, DL, VT);
29175
29176 return getPredicateForScalableVector(DAG, DL, VT);
29177}
29178
29179// Grow V to consume an entire SVE register.
29180static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29181 assert(VT.isScalableVector() &&
29182 "Expected to convert into a scalable vector!");
29183 assert(V.getValueType().isFixedLengthVector() &&
29184 "Expected a fixed length vector operand!");
29185 SDLoc DL(V);
29186 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29187 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29188}
29189
29190// Shrink V so it's just big enough to maintain a VT's worth of data.
29191static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29192 assert(VT.isFixedLengthVector() &&
29193 "Expected to convert into a fixed length vector!");
29194 assert(V.getValueType().isScalableVector() &&
29195 "Expected a scalable vector operand!");
29196 SDLoc DL(V);
29197 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29198 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29199}
29200
29201// Convert all fixed length vector loads larger than NEON to masked_loads.
29202SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29203 SDValue Op, SelectionDAG &DAG) const {
29204 auto Load = cast<LoadSDNode>(Op);
29205
29206 SDLoc DL(Op);
29207 EVT VT = Op.getValueType();
29208 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29209 EVT LoadVT = ContainerVT;
29210 EVT MemVT = Load->getMemoryVT();
29211
29212 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29213
29214 if (VT.isFloatingPoint()) {
29215 LoadVT = ContainerVT.changeTypeToInteger();
29216 MemVT = MemVT.changeTypeToInteger();
29217 }
29218
29219 SDValue NewLoad = DAG.getMaskedLoad(
29220 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29221 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29222 Load->getAddressingMode(), Load->getExtensionType());
29223
29224 SDValue Result = NewLoad;
29225 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29226 EVT ExtendVT = ContainerVT.changeVectorElementType(
29227 Load->getMemoryVT().getVectorElementType());
29228
29229 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29230 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29231 Pg, Result, DAG.getUNDEF(ContainerVT));
29232 } else if (VT.isFloatingPoint()) {
29233 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29234 }
29235
29236 Result = convertFromScalableVector(DAG, VT, Result);
29237 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29238 return DAG.getMergeValues(MergedValues, DL);
29239}
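// Editor's sketch (not from this file): for a 256-bit v8i32 load the rewrite
// above is, schematically,
//   t0: v8i32 = load t1
// ->
//   pg : nxv4i1  = ptrue vl8
//   t0': nxv4i32 = masked_load t1, pg, undef
//   t0 : v8i32   = extract_subvector t0', 0
// i.e. a predicated SVE load in the container type, shrunk back to the fixed
// type for the rest of the DAG.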
29240
29241static SDValue convertFixedMaskToScalableVector(SDValue Mask,
29242 SelectionDAG &DAG) {
29243 SDLoc DL(Mask);
29244 EVT InVT = Mask.getValueType();
29245 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29246 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29247
29248 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29249 return Pg;
29250
29251 bool InvertCond = false;
29252 if (isBitwiseNot(Mask)) {
29253 InvertCond = true;
29254 Mask = Mask.getOperand(0);
29255 }
29256
29257 SDValue Op1, Op2;
29258 ISD::CondCode CC;
29259
29260 // When Mask is the result of a SETCC, it's better to regenerate the compare.
29261 if (Mask.getOpcode() == ISD::SETCC) {
29262 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29263 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29264 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29265 } else {
29266 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29267 Op2 = DAG.getConstant(0, DL, ContainerVT);
29268 CC = ISD::SETNE;
29269 }
29270
29271 if (InvertCond)
29272 CC = getSetCCInverse(CC, Op1.getValueType());
29273
29274 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29275 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29276}
29277
29278// Convert all fixed length vector loads larger than NEON to masked_loads.
29279SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29280 SDValue Op, SelectionDAG &DAG) const {
29281 auto Load = cast<MaskedLoadSDNode>(Op);
29282
29283 SDLoc DL(Op);
29284 EVT VT = Op.getValueType();
29285 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29286
29287 SDValue Mask = Load->getMask();
29288 // If this is an extending load and the mask type is not the same as
29289 // load's type then we have to extend the mask type.
29290 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29291 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29292 "Incorrect mask type");
29293 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29294 }
29295 Mask = convertFixedMaskToScalableVector(Mask, DAG);
29296
29297 SDValue PassThru;
29298 bool IsPassThruZeroOrUndef = false;
29299
29300 if (Load->getPassThru()->isUndef()) {
29301 PassThru = DAG.getUNDEF(ContainerVT);
29302 IsPassThruZeroOrUndef = true;
29303 } else {
29304 if (ContainerVT.isInteger())
29305 PassThru = DAG.getConstant(0, DL, ContainerVT);
29306 else
29307 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29308 if (isZerosVector(Load->getPassThru().getNode()))
29309 IsPassThruZeroOrUndef = true;
29310 }
29311
29312 SDValue NewLoad = DAG.getMaskedLoad(
29313 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29314 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29315 Load->getAddressingMode(), Load->getExtensionType());
29316
29317 SDValue Result = NewLoad;
29318 if (!IsPassThruZeroOrUndef) {
29319 SDValue OldPassThru =
29320 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29321 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29322 }
29323
29324 Result = convertFromScalableVector(DAG, VT, Result);
29325 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29326 return DAG.getMergeValues(MergedValues, DL);
29327}
29328
29329// Convert all fixed length vector stores larger than NEON to masked_stores.
29330SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29331 SDValue Op, SelectionDAG &DAG) const {
29332 auto Store = cast<StoreSDNode>(Op);
29333
29334 SDLoc DL(Op);
29335 EVT VT = Store->getValue().getValueType();
29336 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29337 EVT MemVT = Store->getMemoryVT();
29338
29339 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29340 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29341
29342 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29343 EVT TruncVT = ContainerVT.changeVectorElementType(
29344 Store->getMemoryVT().getVectorElementType());
29345 MemVT = MemVT.changeTypeToInteger();
29346 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29347 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29348 DAG.getUNDEF(TruncVT));
29349 NewValue =
29350 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29351 } else if (VT.isFloatingPoint()) {
29352 MemVT = MemVT.changeTypeToInteger();
29353 NewValue =
29354 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29355 }
29356
29357 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
29358 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
29359 Store->getMemOperand(), Store->getAddressingMode(),
29360 Store->isTruncatingStore());
29361}
29362
29363SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
29364 SDValue Op, SelectionDAG &DAG) const {
29365 auto *Store = cast<MaskedStoreSDNode>(Op);
29366
29367 SDLoc DL(Op);
29368 EVT VT = Store->getValue().getValueType();
29369 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29370
29371 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29372 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
29373
29374 return DAG.getMaskedStore(
29375 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
29376 Mask, Store->getMemoryVT(), Store->getMemOperand(),
29377 Store->getAddressingMode(), Store->isTruncatingStore());
29378}
29379
29380SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
29381 SDValue Op, SelectionDAG &DAG) const {
29382 SDLoc DL(Op);
29383 EVT VT = Op.getValueType();
29384 EVT EltVT = VT.getVectorElementType();
29385
29386 bool Signed = Op.getOpcode() == ISD::SDIV;
29387 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
29388
29389 bool Negated;
29390 uint64_t SplatVal;
29391 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
29392 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29393 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29394 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
29395 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29396
29397 SDValue Res =
29398 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
29399 if (Negated)
29400 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
29401 DAG.getConstant(0, DL, ContainerVT), Res);
29402
29403 return convertFromScalableVector(DAG, VT, Res);
29404 }
29405
29406 // Scalable vector i32/i64 DIV is supported.
29407 if (EltVT == MVT::i32 || EltVT == MVT::i64)
29408 return LowerToPredicatedOp(Op, DAG, PredOpcode);
29409
29410 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
29411 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
29412 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
29413 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29414
29415 // If the wider type is legal: extend, op, and truncate.
29416 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29417 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
29418 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
29419 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
29420 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
29421 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
29422 }
29423
29424 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
29425 &ExtendOpcode](SDValue Op) {
29426 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
29427 SDValue IdxHalf =
29428 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
29429 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
29430 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
29431 return std::pair<SDValue, SDValue>(
29432 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
29433 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
29434 };
29435
29436 // If wider type is not legal: split, extend, op, trunc and concat.
29437 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
29438 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
29439 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
29440 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
29441 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
29442 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
29443 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
29444}
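// Editor's worked example (not from this file): a fixed v8i16 sdiv has no
// direct SVE form. If v8i32 is legal it goes through the extend -> divide ->
// truncate path above; otherwise each operand is split into two v4i16 halves,
// sign-extended to v4i32, divided with the predicated SDIV, truncated back,
// and the halves concatenated into the v8i16 result.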
29445
29446SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
29447 SDValue Op, SelectionDAG &DAG) const {
29448 EVT VT = Op.getValueType();
29449 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29450
29451 SDLoc DL(Op);
29452 SDValue Val = Op.getOperand(0);
29453 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29454 Val = convertToScalableVector(DAG, ContainerVT, Val);
29455
29456 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
29457 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
29458
29459 // Repeatedly unpack Val until the result is of the desired element type.
29460 switch (ContainerVT.getSimpleVT().SimpleTy) {
29461 default:
29462 llvm_unreachable("unimplemented container type");
29463 case MVT::nxv16i8:
29464 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
29465 if (VT.getVectorElementType() == MVT::i16)
29466 break;
29467 [[fallthrough]];
29468 case MVT::nxv8i16:
29469 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
29470 if (VT.getVectorElementType() == MVT::i32)
29471 break;
29472 [[fallthrough]];
29473 case MVT::nxv4i32:
29474 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
29475 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29476 break;
29477 }
29478
29479 return convertFromScalableVector(DAG, VT, Val);
29480}
29481
29482SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29483 SDValue Op, SelectionDAG &DAG) const {
29484 EVT VT = Op.getValueType();
29485 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29486
29487 SDLoc DL(Op);
29488 SDValue Val = Op.getOperand(0);
29489 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29490 Val = convertToScalableVector(DAG, ContainerVT, Val);
29491
29492 // Repeatedly truncate Val until the result is of the desired element type.
29493 switch (ContainerVT.getSimpleVT().SimpleTy) {
29494 default:
29495 llvm_unreachable("unimplemented container type");
29496 case MVT::nxv2i64:
29497 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
29498 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
29499 if (VT.getVectorElementType() == MVT::i32)
29500 break;
29501 [[fallthrough]];
29502 case MVT::nxv4i32:
29503 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
29504 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
29505 if (VT.getVectorElementType() == MVT::i16)
29506 break;
29507 [[fallthrough]];
29508 case MVT::nxv8i16:
29509 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
29510 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
29511 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
29512 break;
29513 }
29514
29515 return convertFromScalableVector(DAG, VT, Val);
29516}
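// Editor's example (not from this file): truncating an nxv2i64 container down
// to i8 elements walks the whole switch above,
//   nxv2i64 -(bitcast+uzp1)-> nxv4i32 -(bitcast+uzp1)-> nxv8i16
//           -(bitcast+uzp1)-> nxv16i8
// keeping the low half of each wider element at every step.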
29517
29518SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
29519 SDValue Op, SelectionDAG &DAG) const {
29520 EVT VT = Op.getValueType();
29521 EVT InVT = Op.getOperand(0).getValueType();
29522 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
29523
29524 SDLoc DL(Op);
29525 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29526 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29527
29528 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
29529}
29530
29531SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
29532 SDValue Op, SelectionDAG &DAG) const {
29533 EVT VT = Op.getValueType();
29534 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29535
29536 SDLoc DL(Op);
29537 EVT InVT = Op.getOperand(0).getValueType();
29538 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29539 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29540
29541 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
29542 Op.getOperand(1), Op.getOperand(2));
29543
29544 return convertFromScalableVector(DAG, VT, ScalableRes);
29545}
29546
29547// Convert vector operation 'Op' to an equivalent predicated operation whereby
29548// the original operation's type is used to construct a suitable predicate.
29549// NOTE: The results for inactive lanes are undefined.
29550SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
29551 SelectionDAG &DAG,
29552 unsigned NewOp) const {
29553 EVT VT = Op.getValueType();
29554 SDLoc DL(Op);
29555 auto Pg = getPredicateForVector(DAG, DL, VT);
29556
29557 if (VT.isFixedLengthVector()) {
29558 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
29559 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29560
29561 // Create list of operands by converting existing ones to scalable types.
29562 SmallVector<SDValue, 4> Operands = {Pg};
29563 for (const SDValue &V : Op->op_values()) {
29564 if (isa<CondCodeSDNode>(V)) {
29565 Operands.push_back(V);
29566 continue;
29567 }
29568
29569 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
29570 EVT VTArg = VTNode->getVT().getVectorElementType();
29571 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
29572 Operands.push_back(DAG.getValueType(NewVTArg));
29573 continue;
29574 }
29575
29576 assert(isTypeLegal(V.getValueType()) &&
29577 "Expected only legal fixed-width types");
29578 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
29579 }
29580
29581 if (isMergePassthruOpcode(NewOp))
29582 Operands.push_back(DAG.getUNDEF(ContainerVT));
29583
29584 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
29585 return convertFromScalableVector(DAG, VT, ScalableRes);
29586 }
29587
29588 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
29589
29590 SmallVector<SDValue, 4> Operands = {Pg};
29591 for (const SDValue &V : Op->op_values()) {
29592 assert((!V.getValueType().isVector() ||
29593 V.getValueType().isScalableVector()) &&
29594 "Only scalable vectors are supported!");
29595 Operands.push_back(V);
29596 }
29597
29598 if (isMergePassthruOpcode(NewOp))
29599 Operands.push_back(DAG.getUNDEF(VT));
29600
29601 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
29602}
29603
29604// If a fixed length vector operation has no side effects when applied to
29605// undefined elements, we can safely use scalable vectors to perform the same
29606// operation without needing to worry about predication.
29607SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
29608 SelectionDAG &DAG) const {
29609 EVT VT = Op.getValueType();
29611 "Only expected to lower fixed length vector operation!");
29612 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29613
29614 // Create list of operands by converting existing ones to scalable types.
29615 SmallVector<SDValue, 4> Ops;
29616 for (const SDValue &V : Op->op_values()) {
29617 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
29618
29619 // Pass through non-vector operands.
29620 if (!V.getValueType().isVector()) {
29621 Ops.push_back(V);
29622 continue;
29623 }
29624
29625 // "cast" fixed length vector to a scalable vector.
29626 assert(V.getValueType().isFixedLengthVector() &&
29627 isTypeLegal(V.getValueType()) &&
29628 "Only fixed length vectors are supported!");
29629 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
29630 }
29631
29632 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
29633 return convertFromScalableVector(DAG, VT, ScalableRes);
29634}
29635
29636SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
29637 SelectionDAG &DAG) const {
29638 SDLoc DL(ScalarOp);
29639 SDValue AccOp = ScalarOp.getOperand(0);
29640 SDValue VecOp = ScalarOp.getOperand(1);
29641 EVT SrcVT = VecOp.getValueType();
29642 EVT ResVT = SrcVT.getVectorElementType();
29643
29644 EVT ContainerVT = SrcVT;
29645 if (SrcVT.isFixedLengthVector()) {
29646 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
29647 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
29648 }
29649
29650 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29651 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29652
29653 // Convert operands to Scalable.
29654 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
29655 DAG.getUNDEF(ContainerVT), AccOp, Zero);
29656
29657 // Perform reduction.
29658 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
29659 Pg, AccOp, VecOp);
29660
29661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
29662}
29663
29664SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
29665 SelectionDAG &DAG) const {
29666 SDLoc DL(ReduceOp);
29667 SDValue Op = ReduceOp.getOperand(0);
29668 EVT OpVT = Op.getValueType();
29669 EVT VT = ReduceOp.getValueType();
29670
29671 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
29672 return SDValue();
29673
29674 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
29675
29676 switch (ReduceOp.getOpcode()) {
29677 default:
29678 return SDValue();
29679 case ISD::VECREDUCE_OR:
29680 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
29681 // The predicate can be 'Op' because
29682 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
29683 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
29684 else
29685 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
29686 case ISD::VECREDUCE_AND: {
29687 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
29688 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
29689 }
29690 case ISD::VECREDUCE_XOR: {
29691 SDValue ID =
29692 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
29693 if (OpVT == MVT::nxv1i1) {
29694 // Emulate a CNTP on .Q using .D and a different governing predicate.
29695 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
29696 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
29697 }
29698 SDValue Cntp =
29699 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
29700 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
29701 }
29702 }
29703
29704 return SDValue();
29705}
29706
29707SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
29708 SDValue ScalarOp,
29709 SelectionDAG &DAG) const {
29710 SDLoc DL(ScalarOp);
29711 SDValue VecOp = ScalarOp.getOperand(0);
29712 EVT SrcVT = VecOp.getValueType();
29713
29714 if (useSVEForFixedLengthVectorVT(
29715 SrcVT,
29716 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
29717 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
29718 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
29719 }
29720
29721 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
29722 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
29723 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
29724 SDValue BoolVec = VecOp.getOperand(0);
29725 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
29726 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
29727 SDValue CntpOp = DAG.getNode(
29728 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
29729 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
29730 BoolVec, BoolVec);
29731 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
29732 }
29733 }
29734
29735 // UADDV always returns an i64 result.
29736 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
29737 SrcVT.getVectorElementType();
29738 EVT RdxVT = SrcVT;
29739 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
29740 RdxVT = getPackedSVEVectorVT(ResVT);
29741
29742 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29743 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
29744 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
29745 Rdx, DAG.getConstant(0, DL, MVT::i64));
29746
29747 // The VEC_REDUCE nodes expect an element size result.
29748 if (ResVT != ScalarOp.getValueType())
29749 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
29750
29751 return Res;
29752}
29753
29754SDValue
29755AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
29756 SelectionDAG &DAG) const {
29757 EVT VT = Op.getValueType();
29758 SDLoc DL(Op);
29759
29760 EVT InVT = Op.getOperand(1).getValueType();
29761 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29762 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
29763 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
29764
29765 // Convert the mask to a predicate (NOTE: We don't need to worry about
29766 // inactive lanes since VSELECT is safe when given undefined elements).
29767 EVT MaskVT = Op.getOperand(0).getValueType();
29768 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
29769 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
29770 Mask = DAG.getNode(ISD::TRUNCATE, DL,
29771 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
29772
29773 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
29774 Mask, Op1, Op2);
29775
29776 return convertFromScalableVector(DAG, VT, ScalableRes);
29777}
29778
29779SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
29780 SDValue Op, SelectionDAG &DAG) const {
29781 SDLoc DL(Op);
29782 EVT InVT = Op.getOperand(0).getValueType();
29783 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29784
29785 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29786 "Only expected to lower fixed length vector operation!");
29787 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29788 "Expected integer result of the same bit length as the inputs!");
29789
29790 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29791 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
29792 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29793
29794 EVT CmpVT = Pg.getValueType();
29795 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
29796 {Pg, Op1, Op2, Op.getOperand(2)});
29797
29798 EVT PromoteVT = ContainerVT.changeTypeToInteger();
29799 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
29800 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
29801}
29802
29803SDValue
29804AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29805 SelectionDAG &DAG) const {
29806 SDLoc DL(Op);
29807 auto SrcOp = Op.getOperand(0);
29808 EVT VT = Op.getValueType();
29809 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29810 EVT ContainerSrcVT =
29811 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
29812
29813 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
29814 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
29815 return convertFromScalableVector(DAG, VT, Op);
29816}
29817
29818SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29819 SDValue Op, SelectionDAG &DAG) const {
29820 SDLoc DL(Op);
29821 unsigned NumOperands = Op->getNumOperands();
29822
29823 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29824 "Unexpected number of operands in CONCAT_VECTORS");
29825
29826 auto SrcOp1 = Op.getOperand(0);
29827 auto SrcOp2 = Op.getOperand(1);
29828 EVT VT = Op.getValueType();
29829 EVT SrcVT = SrcOp1.getValueType();
29830
29831 // Match a splat of 128b segments that fit in a single register.
29832 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
29833 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29834 SDValue Splat =
29835 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
29836 convertToScalableVector(DAG, ContainerVT, SrcOp1),
29837 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
29838 return convertFromScalableVector(DAG, VT, Splat);
29839 }
29840
29841 if (NumOperands > 2) {
29842 SmallVector<SDValue, 4> Ops;
29843 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
29844 for (unsigned I = 0; I < NumOperands; I += 2)
29845 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
29846 Op->getOperand(I), Op->getOperand(I + 1)));
29847
29848 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
29849 }
29850
29851 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29852
29853 auto Pg = getPredicateForVector(DAG, DL, VT);
29854 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
29855 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
29856
29857 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
29858
29859 return convertFromScalableVector(DAG, VT, Op);
29860}
29861
29862SDValue
29863AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29864 SelectionDAG &DAG) const {
29865 EVT VT = Op.getValueType();
29866 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29867
29868 SDLoc DL(Op);
29869 SDValue Val = Op.getOperand(0);
29870 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29871 EVT SrcVT = Val.getValueType();
29872 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29873 EVT ExtendVT = ContainerVT.changeVectorElementType(
29874 SrcVT.getVectorElementType());
29875
29876 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29877 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
29878
29879 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
29880 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
29881 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29882 Pg, Val, DAG.getUNDEF(ContainerVT));
29883
29884 return convertFromScalableVector(DAG, VT, Val);
29885}
29886
29887SDValue
29888AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29889 SelectionDAG &DAG) const {
29890 EVT VT = Op.getValueType();
29891 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29892
29893 SDLoc DL(Op);
29894 SDValue Val = Op.getOperand(0);
29895 EVT SrcVT = Val.getValueType();
29896 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29897 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29898 VT.getVectorElementType());
29899 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
29900
29901 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29902 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
29903 Op.getOperand(1), DAG.getUNDEF(RoundVT));
29904 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
29905 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29906
29907 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29908 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29909}
29910
29911SDValue
29912AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29913 SelectionDAG &DAG) const {
29914 EVT VT = Op.getValueType();
29915 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29916
29917 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29918 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29919 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
29920
29921 SDLoc DL(Op);
29922 SDValue Val = Op.getOperand(0);
29923 EVT SrcVT = Val.getValueType();
29924 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29925 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29926
29927 if (VT.bitsGE(SrcVT)) {
29928 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29929
29930 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29931 VT.changeTypeToInteger(), Val);
29932
29933 // Safe to use a larger than specified operand because by promoting the
29934 // value nothing has changed from an arithmetic point of view.
29935 Val =
29936 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
29937 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29938 DAG.getUNDEF(ContainerDstVT));
29939 return convertFromScalableVector(DAG, VT, Val);
29940 } else {
29941 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29942 ContainerDstVT.getVectorElementType());
29943 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
29944
29945 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29946 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29947 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
29948 Val = convertFromScalableVector(DAG, SrcVT, Val);
29949
29950 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29951 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29952 }
29953}
29954
29955SDValue
29956AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29957 SelectionDAG &DAG) const {
29958 SDLoc DL(Op);
29959 EVT OpVT = Op.getValueType();
29960 assert(OpVT.isScalableVector() &&
29961 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29962
29963 // Are multi-register uzp instructions available?
29964 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
29965 OpVT.getVectorElementType() != MVT::i1) {
29966 Intrinsic::ID IntID;
29967 switch (Op->getNumOperands()) {
29968 default:
29969 return SDValue();
29970 case 2:
29971 IntID = Intrinsic::aarch64_sve_uzp_x2;
29972 break;
29973 case 4:
29974 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
29975 OpVT.getScalarSizeInBits() == 64)
29976 return SDValue();
29977 IntID = Intrinsic::aarch64_sve_uzp_x4;
29978 break;
29979 }
29980
29981 SmallVector<SDValue, 5> Ops;
29982 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
29983 Ops.append(Op->op_values().begin(), Op->op_values().end());
29984 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
29985 }
29986
29987 if (Op->getNumOperands() != 2)
29988 return SDValue();
29989
29990 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
29991 Op.getOperand(1));
29992 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
29993 Op.getOperand(1));
29994 return DAG.getMergeValues({Even, Odd}, DL);
29995}
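// Editor's example (not from this file): without SME2 a two-operand
// deinterleave of nxv4i32 values is just the pair of permutes built above,
//   uzp1 z2.s, z0.s, z1.s    // even elements
//   uzp2 z3.s, z0.s, z1.s    // odd elements
// while the streaming SME2 path uses the multi-vector UZP forms via the
// intrinsic call.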
29996
29997SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29998 SelectionDAG &DAG) const {
29999 SDLoc DL(Op);
30000 EVT OpVT = Op.getValueType();
30001 assert(OpVT.isScalableVector() &&
30002 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30003
30004 // Are multi-register zip instructions available?
30005 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30006 OpVT.getVectorElementType() != MVT::i1) {
30007 Intrinsic::ID IntID;
30008 switch (Op->getNumOperands()) {
30009 default:
30010 return SDValue();
30011 case 2:
30012 IntID = Intrinsic::aarch64_sve_zip_x2;
30013 break;
30014 case 4:
30015 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30016 OpVT.getScalarSizeInBits() == 64)
30017 return SDValue();
30018 IntID = Intrinsic::aarch64_sve_zip_x4;
30019 break;
30020 }
30021
30023 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30024 Ops.append(Op->op_values().begin(), Op->op_values().end());
30025 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30026 }
30027
30028 if (Op->getNumOperands() != 2)
30029 return SDValue();
30030
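// Illustrative example: ZIP1 interleaves the low halves of the two inputs and
// ZIP2 the high halves, so {a0 a1 ...} and {b0 b1 ...} become
// {a0 b0 a1 b1 ...} followed by the remaining pairs.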
30031 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30032 Op.getOperand(1));
30033 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30034 Op.getOperand(1));
30035 return DAG.getMergeValues({Lo, Hi}, DL);
30036}
30037
30038SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30039 SelectionDAG &DAG) const {
30040 // FIXME: Maybe share some code with LowerMGather/Scatter?
30041 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30042 SDLoc DL(HG);
30043 SDValue Chain = HG->getChain();
30044 SDValue Inc = HG->getInc();
30045 SDValue Mask = HG->getMask();
30046 SDValue Ptr = HG->getBasePtr();
30047 SDValue Index = HG->getIndex();
30048 SDValue Scale = HG->getScale();
30049 SDValue IntID = HG->getIntID();
30050
30051 // The Intrinsic ID determines the type of update operation.
30052 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30053 // Right now, we only support 'add' as an update.
30054 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30055 "Unexpected histogram update operation");
30056
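// Sketch of the expansion below: gather the current bucket values, use
// HISTCNT to count the active lanes that share each index, multiply that
// count by Inc, add the products to the gathered values, and scatter the
// results back to memory.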
30057 EVT IndexVT = Index.getValueType();
30058 LLVMContext &Ctx = *DAG.getContext();
30059 ElementCount EC = IndexVT.getVectorElementCount();
30060 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30061 EVT IncExtVT =
30062 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30063 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30064 bool ExtTrunc = IncSplatVT != MemVT;
30065
30066 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30067 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30068 SDValue IncSplat = DAG.getSplatVector(
30069 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30070 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30071
30072 MachineMemOperand *MMO = HG->getMemOperand();
30073 // Create an MMO for the gather, without load|store flags.
30074 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30075 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
30076 MMO->getAlign(), MMO->getAAInfo());
30077 ISD::MemIndexType IndexType = HG->getIndexType();
30078 SDValue Gather = DAG.getMaskedGather(
30079 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30080 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30081
30082 SDValue GChain = Gather.getValue(1);
30083
30084 // Perform the histcnt, multiply by inc, add to bucket data.
30085 SDValue ID =
30086 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30087 SDValue HistCnt =
30088 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30089 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30090 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30091
30092 // Create an MMO for the scatter, without load|store flags.
30093 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30094 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
30095 MMO->getAlign(), MMO->getAAInfo());
30096
30097 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30098 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30099 ScatterOps, SMMO, IndexType, ExtTrunc);
30100 return Scatter;
30101}
30102
30103/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30104/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30105/// however still make use of the dot product instruction by instead
30106/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30107/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30108/// the following pattern is emitted:
30109 /// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))),
30110 ///     ext(EXTRACT_SUBVECTOR(N, NTy/2)))
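/// For example, a partial reduction of mul(nxv16i8, nxv16i8) into an nxv2i64
/// accumulator is first lowered to a (u|s)dot producing nxv4i32, which is
/// then folded into the accumulator either with (U|S)ADDWB/(U|S)ADDWT or,
/// failing that, with a pair of extends and adds.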
30111SDValue
30112AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30113 SelectionDAG &DAG) const {
30114 SDLoc DL(Op);
30115
30116 SDValue Acc = Op.getOperand(0);
30117 SDValue LHS = Op.getOperand(1);
30118 SDValue RHS = Op.getOperand(2);
30119 EVT ResultVT = Op.getValueType();
30120 EVT OrigResultVT = ResultVT;
30121 EVT OpVT = LHS.getValueType();
30122
30123 bool ConvertToScalable =
30124 ResultVT.isFixedLengthVector() &&
30125 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30126
30127 if (ConvertToScalable) {
30128 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30129 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30130 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30131 LHS = convertToScalableVector(DAG, OpVT, LHS);
30132 RHS = convertToScalableVector(DAG, OpVT, RHS);
30133 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30134 }
30135
30136 // Two-way and four-way partial reductions are supported by patterns.
30137 // We only need to handle the 8-way partial reduction.
30138 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30139 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30140 : Op;
30141
30142 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30143 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30144 DAG.getConstant(0, DL, DotVT), LHS, RHS);
30145
30146 SDValue Res;
30147 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30148 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30149 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30150 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30151 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30152 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30153 } else {
30154 // Fold (nx)v4i32 into (nx)v2i64
30155 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30156 if (IsUnsigned) {
30157 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30158 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30159 } else {
30160 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30161 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30162 }
30163 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30164 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30165 }
30166
30167 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30168 : Res;
30169}
30170
30171SDValue
30172AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30173 SelectionDAG &DAG) const {
30174 EVT VT = Op.getValueType();
30175 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30176
30177 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30178 "Lowering fixed length get_active_lane_mask requires SVE!");
30179
30180 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30181 // but we can use SVE when available.
30182
30183 SDLoc DL(Op);
30184 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30185 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30186
30187 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30188 Op.getOperand(0), Op.getOperand(1));
30189 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30190 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30191 DAG.getVectorIdxConstant(0, DL));
30192}
30193
30194SDValue
30195AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30196 SelectionDAG &DAG) const {
30197 EVT VT = Op.getValueType();
30198 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30199
30200 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30201 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30202 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30203
30204 SDLoc DL(Op);
30205 SDValue Val = Op.getOperand(0);
30206 EVT SrcVT = Val.getValueType();
30207 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30208 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30209
30210 if (VT.bitsGT(SrcVT)) {
30211 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30212 ContainerSrcVT.getVectorElementType());
30213 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30214
30215 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30216 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30217
30218 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30219 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30220 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30221 DAG.getUNDEF(ContainerDstVT));
30222 return convertFromScalableVector(DAG, VT, Val);
30223 } else {
30224 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30225 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
30226
30227 // Safe to use a larger-than-specified result, since an fp_to_int whose
30228 // result doesn't fit into the destination is undefined.
30229 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30230 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30231 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30232
30233 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30234 }
30235}
30236
30237 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
30238 ArrayRef<int> ShuffleMask, EVT VT,
30239 EVT ContainerVT, SelectionDAG &DAG) {
30240 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30241 SDLoc DL(Op);
30242 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30243 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30244 bool IsSingleOp =
30245 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30246
30247 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30248 MinSVESize = 128;
30249
30250 // Bail out on two-operand shuffles if SVE2 is unavailable, or if not all
30251 // index values can be represented.
30252 if (!IsSingleOp && !Subtarget.hasSVE2())
30253 return SDValue();
30254
30255 EVT VTOp1 = Op.getOperand(0).getValueType();
30256 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30257 unsigned IndexLen = MinSVESize / BitsPerElt;
30258 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30259 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30260 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30261 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30262 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30263 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30264 "Incorrectly legalised shuffle operation");
30265
30267 // If MinSVESize is not equal to MaxSVESize then we need to know which
30268 // TBL mask element needs adjustment.
30269 SmallVector<SDValue, 8> AddRuntimeVLMask;
30270
30271 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30272 // size, 8 bits are only sufficient to index into the first source vector.
30273 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30274 return SDValue();
30275
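// Worked example (assuming MinSVESize == MaxSVESize == 256): for two v8i16
// operands, IndexLen == 16, so a mask index of 9 (element 1 of the second
// operand) becomes 9 + (16 - 8) == 17, i.e. element 1 of the second TBL
// source register.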
30276 for (int Index : ShuffleMask) {
30278 // Handle poison index values.
30278 if (Index < 0)
30279 Index = 0;
30280 // If the mask refers to elements in the second operand, then we have to
30281 // offset the index by the number of elements in a vector. If this number
30282 // is not known at compile-time, we need to maintain a mask with 'VL' values
30283 // to add at runtime.
30284 if ((unsigned)Index >= ElementsPerVectorReg) {
30285 if (MinMaxEqual) {
30286 Index += IndexLen - ElementsPerVectorReg;
30287 } else {
30288 Index = Index - ElementsPerVectorReg;
30289 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30290 }
30291 } else if (!MinMaxEqual)
30292 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30293 // For 8-bit elements and 1024-bit SVE registers, MaxOffset equals 255 and
30294 // might point to the last element of the second operand of the
30295 // shufflevector, so we reject this transform.
30296 if ((unsigned)Index >= MaxOffset)
30297 return SDValue();
30298 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30299 }
30300
30301 // Pad the remaining mask slots with an out-of-range index so that TBL
30302 // zeroes those lanes instead of duplicating the first lane for them. Note
30303 // that for i8 elements an out-of-range index could still be a valid index
30304 // when the vector register size is 2048 bits.
30305 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30306 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30307 if (!MinMaxEqual)
30308 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30309 }
30310
30311 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30312 SDValue VecMask =
30313 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30314 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30315
30316 SDValue Shuffle;
30317 if (IsSingleOp)
30318 Shuffle =
30319 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30320 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30321 Op1, SVEMask);
30322 else if (Subtarget.hasSVE2()) {
30323 if (!MinMaxEqual) {
30324 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30325 SDValue VScale = (BitsPerElt == 64)
30326 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30327 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30328 SDValue VecMask =
30329 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30330 SDValue MulByMask = DAG.getNode(
30331 ISD::MUL, DL, MaskType,
30332 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30333 DAG.getBuildVector(MaskType, DL,
30334 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30335 SDValue UpdatedVecMask =
30336 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30337 SVEMask = convertToScalableVector(
30338 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30339 }
30340 Shuffle =
30341 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30342 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30343 Op1, Op2, SVEMask);
30344 }
30345 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30346 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30347}
30348
30349SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30350 SDValue Op, SelectionDAG &DAG) const {
30351 EVT VT = Op.getValueType();
30352 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30353
30354 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
30355 auto ShuffleMask = SVN->getMask();
30356
30357 SDLoc DL(Op);
30358 SDValue Op1 = Op.getOperand(0);
30359 SDValue Op2 = Op.getOperand(1);
30360
30361 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30362 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
30363 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
30364
30365 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
30366 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
30367 return MVT::i32;
30368 return ScalarTy;
30369 };
30370
30371 if (SVN->isSplat()) {
30372 unsigned Lane = std::max(0, SVN->getSplatIndex());
30373 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30374 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30375 DAG.getConstant(Lane, DL, MVT::i64));
30376 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
30377 return convertFromScalableVector(DAG, VT, Op);
30378 }
30379
30380 bool ReverseEXT = false;
30381 unsigned Imm;
30382 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
30383 Imm == VT.getVectorNumElements() - 1) {
30384 if (ReverseEXT)
30385 std::swap(Op1, Op2);
30386 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30387 SDValue Scalar = DAG.getNode(
30388 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30389 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
30390 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
30391 return convertFromScalableVector(DAG, VT, Op);
30392 }
30393
30394 unsigned EltSize = VT.getScalarSizeInBits();
30395 for (unsigned BlockSize : {64U, 32U, 16U}) {
30396 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
30397 unsigned RevOp;
30398 if (EltSize == 8)
30399 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
30400 else if (EltSize == 16)
30401 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
30402 else
30403 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
30404 EVT BlockedVT =
30405 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
30406 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
30407 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
30408 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
30409 DAG.getUNDEF(BlockedVT));
30410 SDValue Container =
30411 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
30412 return convertFromScalableVector(DAG, VT, Container);
30413 }
30414 }
30415
30416 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
30417 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
30418 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30419 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
30420 Pg, Op1, DAG.getUNDEF(ContainerVT));
30421 return convertFromScalableVector(DAG, VT, Revd);
30422 }
30423
30424 unsigned WhichResult;
30425 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30426 WhichResult == 0)
30427 return convertFromScalableVector(
30428 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
30429
30430 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30431 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30432 return convertFromScalableVector(
30433 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30434 }
30435
30436 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
30437 return convertFromScalableVector(
30438 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
30439
30440 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30441 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30441 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30442 return convertFromScalableVector(
30443 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30444 }
30445
30446 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
30447 // represents the same logical operation as performed by a ZIP instruction. In
30448 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
30449 // equivalent to an AArch64 instruction. There's the extra component of
30450 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
30451 // only operated on 64/128-bit vector types that have a direct mapping to a
30452 // target register and so an exact mapping is implied.
30453 // However, when using SVE for fixed length vectors, most legal vector types
30454 // are actually sub-vectors of a larger SVE register. When mapping
30455 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
30456 // how the mask's indices translate. Specifically, when the mapping requires
30457 // an exact meaning for a specific vector index (e.g. Index X is the last
30458 // vector element in the register) then such mappings are often only safe when
30459 // the exact SVE register size is known. The main exception to this is when
30460 // indices are logically relative to the first element of either
30461 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
30462 // when converting from fixed-length to scalable vector types (i.e. the start
30463 // of a fixed length vector is always the start of a scalable vector).
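// For example, with VT == v4i32 held in a 256-bit SVE register, ZIP2 of the
// scalable containers would combine lanes 4-7 (undefined padding) rather than
// the intended high halves of the fixed-length vectors, which is why the
// checks below require MinSVESize == MaxSVESize == VT.getSizeInBits().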
30464 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30465 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30466 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30467 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
30468 Op2.isUndef()) {
30469 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
30470 return convertFromScalableVector(DAG, VT, Op);
30471 }
30472
30473 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30474 WhichResult != 0)
30475 return convertFromScalableVector(
30476 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
30477
30478 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30479 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30480 return convertFromScalableVector(
30481 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30482 }
30483
30484 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
30485 return convertFromScalableVector(
30486 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
30487
30488 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30489 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30490 return convertFromScalableVector(
30491 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30492 }
30493
30494 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30495 Subtarget->isSVEorStreamingSVEAvailable()) {
30497 "Unsupported SVE vector size");
30498
30500 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30501 if (std::optional<unsigned> Lane =
30502 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
30503 SDValue IID =
30504 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
30505 return convertFromScalableVector(
30506 DAG, VT,
30507 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30508 {IID, Op1,
30509 DAG.getConstant(*Lane, DL, MVT::i64,
30510 /*isTarget=*/true)}));
30511 }
30512 }
30513 }
30514
30515 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
30516 // This may allow the shuffle to be matched as something cheaper like ZIP1.
30517 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
30518 return WideOp;
30519
30520 // Avoid producing a TBL instruction if we don't know the minimal SVE register
30521 // size, unless NEON is not available and we can assume the minimal SVE
30522 // register size is 128 bits.
30523 if (MinSVESize || !Subtarget->isNeonAvailable())
30524 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
30525 DAG);
30526
30527 return SDValue();
30528}
30529
30530SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
30531 SelectionDAG &DAG) const {
30532 SDLoc DL(Op);
30533 EVT InVT = Op.getValueType();
30534
30535 assert(VT.isScalableVector() && isTypeLegal(VT) &&
30536 InVT.isScalableVector() && isTypeLegal(InVT) &&
30537 "Only expect to cast between legal scalable vector types!");
30538 assert(VT.getVectorElementType() != MVT::i1 &&
30539 InVT.getVectorElementType() != MVT::i1 &&
30540 "For predicate bitcasts, use getSVEPredicateBitCast");
30541
30542 if (InVT == VT)
30543 return Op;
30544
30545 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
30546 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
30547
30548 // Safe bitcasting between unpacked vector types of different element counts
30549 // is currently unsupported because the following is missing the necessary
30550 // work to ensure the result's elements live where they're supposed to within
30551 // an SVE register.
30552 // 01234567
30553 // e.g. nxv2i32 = XX??XX??
30554 // nxv4f16 = X?X?X?X?
30555 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
30556 VT == PackedVT || InVT == PackedInVT) &&
30557 "Unexpected bitcast!");
30558
30559 // Pack input if required.
30560 if (InVT != PackedInVT)
30561 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
30562
30563 if (Subtarget->isLittleEndian() ||
30564 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
30565 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30566 else {
30567 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
30568 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
30569
30570 // Simulate the effect of casting through memory.
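// e.g. for an nxv2i64 -> nxv4i32 cast on a big-endian target: byte-swap each
// 64-bit element, reinterpret the register, then byte-swap each 32-bit
// element, matching what a store and reload of the value would produce.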
30571 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
30572 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
30573 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
30574 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
30575 if (PackedVTAsInt.getScalarSizeInBits() != 8)
30576 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
30577 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30578 }
30579
30580 // Unpack result if required.
30581 if (VT != PackedVT)
30582 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
30583
30584 return Op;
30585}
30586
30587 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
30588 SDValue N) const {
30589 return ::isAllActivePredicate(DAG, N);
30590}
30591
30592 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
30593 return ::getPromotedVTForPredicate(VT);
30594}
30595
30596bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
30597 SDValue Op, const APInt &OriginalDemandedBits,
30598 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
30599 unsigned Depth) const {
30600
30601 unsigned Opc = Op.getOpcode();
30602 switch (Opc) {
30603 case AArch64ISD::VSHL: {
30604 // Match (VSHL (VLSHR Val X) X)
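// e.g. (VSHL (VLSHR Val, 8), 8) zeroes the low 8 bits of each lane; if none
// of those low bits are demanded by users, the shift pair can simply be
// replaced with Val.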
30605 SDValue ShiftL = Op;
30606 SDValue ShiftR = Op->getOperand(0);
30607 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
30608 return false;
30609
30610 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
30611 return false;
30612
30613 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
30614 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
30615
30616 // Other cases can be handled as well, but this is not
30617 // implemented.
30618 if (ShiftRBits != ShiftLBits)
30619 return false;
30620
30621 unsigned ScalarSize = Op.getScalarValueSizeInBits();
30622 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
30623
30624 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
30625 APInt UnusedBits = ~OriginalDemandedBits;
30626
30627 if ((ZeroBits & UnusedBits) != ZeroBits)
30628 return false;
30629
30630 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
30631 // used - simplify to just Val.
30632 return TLO.CombineTo(Op, ShiftR->getOperand(0));
30633 }
30634 case AArch64ISD::BICi: {
30635 // Fold BICi if all destination bits already known to be zeroed
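// e.g. (BICi X, #0xff, LSL #8) clears bits 8-15 of each lane, so if those
// bits are already known to be zero in X the BICi can be removed.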
30636 SDValue Op0 = Op.getOperand(0);
30637 KnownBits KnownOp0 =
30638 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
30639 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
30640 APInt BitsToClear =
30641 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
30642 .trunc(KnownOp0.getBitWidth());
30643 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
30644 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
30645 return TLO.CombineTo(Op, Op0);
30646
30647 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
30648 return false;
30649 }
30650 case ISD::INTRINSIC_WO_CHAIN: {
30651 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
30652 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
30653 if (!MaxSVEVectorSizeInBits)
30654 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
30655 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
30656 // The SVE count intrinsics don't support the multiplier immediate so we
30657 // don't have to account for that here. The value returned may be slightly
30658 // over the true required bits, as this is based on the "ALL" pattern. The
30659 // other patterns are also exposed by these intrinsics, but they all
30660 // return a value that's strictly less than "ALL".
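// For example, CNTB with a 2048-bit maximum vector length returns at most
// 256, so only the low 9 bits of the 64-bit result can ever be set.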
30661 unsigned RequiredBits = llvm::bit_width(MaxElements);
30662 unsigned BitWidth = Known.Zero.getBitWidth();
30663 if (RequiredBits < BitWidth)
30664 Known.Zero.setHighBits(BitWidth - RequiredBits);
30665 return false;
30666 }
30667 }
30668 }
30669
30670 return TargetLowering::SimplifyDemandedBitsForTargetNode(
30671 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
30672}
30673
30674bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
30675 return Op.getOpcode() == AArch64ISD::DUP ||
30676 Op.getOpcode() == AArch64ISD::MOVI ||
30677 Op.getOpcode() == AArch64ISD::MOVIshift ||
30678 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
30679 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
30680 TargetLowering::isTargetCanonicalConstantNode(Op);
30681}
30682
30683 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
30684 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
30685 Subtarget->hasComplxNum();
30686}
30687
30688 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
30689 ComplexDeinterleavingOperation Operation, Type *Ty) const {
30690 auto *VTy = dyn_cast<VectorType>(Ty);
30691 if (!VTy)
30692 return false;
30693
30694 // If the vector is scalable, SVE is enabled, implying support for complex
30695 // numbers. Otherwise, we need to ensure complex number support is available
30696 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
30697 return false;
30698
30699 auto *ScalarTy = VTy->getScalarType();
30700 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
30701
30702 // We can only process vectors that have a bit size of 128 or higher (or
30703 // exactly 64 bits for NEON). Additionally, these vectors must have a
30704 // power-of-2 size, as we later split them into the smallest supported size
30705 // and merge them back together after applying the complex operation.
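// For example, a 512-bit <16 x float> complex multiply is split into two
// 256-bit halves, each of which is split again into 128-bit operations before
// the results are reassembled.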
30706 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
30707 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
30708 !llvm::isPowerOf2_32(VTyWidth))
30709 return false;
30710
30711 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
30712 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
30713
30714 if (Operation == ComplexDeinterleavingOperation::CDot)
30715 return ScalarWidth == 32 || ScalarWidth == 64;
30716 return 8 <= ScalarWidth && ScalarWidth <= 64;
30717 }
30718
30719 // CDot is not supported outside of scalable/sve scopes
30720 if (Operation == ComplexDeinterleavingOperation::CDot)
30721 return false;
30722
30723 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
30724 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
30725}
30726
30727 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
30728 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
30729 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
30730 Value *Accumulator) const {
30731 VectorType *Ty = cast<VectorType>(InputA->getType());
30732 if (Accumulator == nullptr)
30733 Accumulator = Constant::getNullValue(Ty);
30734 bool IsScalable = Ty->isScalableTy();
30735 bool IsInt = Ty->getElementType()->isIntegerTy();
30736
30737 unsigned TyWidth =
30738 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
30739
30740 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
30741 "Vector type must be either 64 or a power of 2 that is at least 128");
30742
30743 if (TyWidth > 128) {
30744 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
30745 int AccStride = cast<VectorType>(Accumulator->getType())
30746 ->getElementCount()
30747 .getKnownMinValue() /
30748 2;
30749 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
30750 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
30751 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
30752 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
30753 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
30754 Value *LowerSplitAcc = nullptr;
30755 Value *UpperSplitAcc = nullptr;
30756 Type *FullTy = Ty;
30757 FullTy = Accumulator->getType();
30758 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
30759 cast<VectorType>(Accumulator->getType()));
30760 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
30761 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
30762 auto *LowerSplitInt = createComplexDeinterleavingIR(
30763 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
30764 auto *UpperSplitInt = createComplexDeinterleavingIR(
30765 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
30766
30767 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
30768 LowerSplitInt, uint64_t(0));
30769 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
30770 }
30771
30772 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
30773 if (IsScalable) {
30774 if (IsInt)
30775 return B.CreateIntrinsic(
30776 Intrinsic::aarch64_sve_cmla_x, Ty,
30777 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30778
30779 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
30780 return B.CreateIntrinsic(
30781 Intrinsic::aarch64_sve_fcmla, Ty,
30782 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30783 }
30784
30785 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
30786 Intrinsic::aarch64_neon_vcmla_rot90,
30787 Intrinsic::aarch64_neon_vcmla_rot180,
30788 Intrinsic::aarch64_neon_vcmla_rot270};
30789
30790
30791 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
30792 {Accumulator, InputA, InputB});
30793 }
30794
30795 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
30796 if (IsScalable) {
30797 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
30798 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
30799 if (IsInt)
30800 return B.CreateIntrinsic(
30801 Intrinsic::aarch64_sve_cadd_x, Ty,
30802 {InputA, InputB, B.getInt32((int)Rotation * 90)});
30803
30804 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
30805 return B.CreateIntrinsic(
30806 Intrinsic::aarch64_sve_fcadd, Ty,
30807 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
30808 }
30809 return nullptr;
30810 }
30811
30812 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
30813 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
30814 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
30815 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
30816 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
30817
30818 if (IntId == Intrinsic::not_intrinsic)
30819 return nullptr;
30820
30821 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
30822 }
30823
30824 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
30825 IsScalable) {
30826 return B.CreateIntrinsic(
30827 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
30828 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30829 }
30830
30831 return nullptr;
30832}
30833
30834bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
30835 unsigned Opc = N->getOpcode();
30836 if (ISD::isExtOpcode(Opc)) {
30837 if (any_of(N->users(),
30838 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
30839 return false;
30840 }
30841 return true;
30842}
30843
30844unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
30845 return Subtarget->getMinimumJumpTableEntries();
30846}
30847
30848 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
30849 CallingConv::ID CC,
30850 EVT VT) const {
30851 bool NonUnitFixedLengthVector =
30853 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
30854 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
30855
30856 EVT VT1;
30857 MVT RegisterVT;
30858 unsigned NumIntermediates;
30859 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
30860 RegisterVT);
30861 return RegisterVT;
30862}
30863
30864 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
30865 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
30866 bool NonUnitFixedLengthVector =
30868 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
30869 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
30870
30871 EVT VT1;
30872 MVT VT2;
30873 unsigned NumIntermediates;
30874 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
30875 NumIntermediates, VT2);
30876}
30877
30878 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
30879 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
30880 unsigned &NumIntermediates, MVT &RegisterVT) const {
30881 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
30882 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
30883 if (!RegisterVT.isFixedLengthVector() ||
30884 RegisterVT.getFixedSizeInBits() <= 128)
30885 return NumRegs;
30886
30887 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
30888 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
30889 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
30890
30891 // A size mismatch here implies either type promotion or widening and would
30892 // have resulted in scalarisation if larger vectors had not been available.
30893 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
30894 EVT EltTy = VT.getVectorElementType();
30895 EVT NewVT = EVT::getVectorVT(Context, EltTy, 128 / EltTy.getSizeInBits());
30896 if (!isTypeLegal(NewVT))
30897 NewVT = EltTy;
30898
30899 IntermediateVT = NewVT;
30900 NumIntermediates = VT.getVectorNumElements();
30901 RegisterVT = getRegisterType(Context, NewVT);
30902 return NumIntermediates;
30903 }
30904
30905 // SVE VLS support does not introduce a new ABI so we should use NEON sized
30906 // types for vector arguments and returns.
30907
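// e.g. with 256-bit SVE fixed-length vectors, a v8i32 argument whose
// RegisterVT is the 256-bit fixed type is instead passed as two v4i32
// (NEON-sized) registers.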
30908 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
30909 NumIntermediates *= NumSubRegs;
30910 NumRegs *= NumSubRegs;
30911
30912 switch (RegisterVT.getVectorElementType().SimpleTy) {
30913 default:
30914 llvm_unreachable("unexpected element type for vector");
30915 case MVT::i8:
30916 IntermediateVT = RegisterVT = MVT::v16i8;
30917 break;
30918 case MVT::i16:
30919 IntermediateVT = RegisterVT = MVT::v8i16;
30920 break;
30921 case MVT::i32:
30922 IntermediateVT = RegisterVT = MVT::v4i32;
30923 break;
30924 case MVT::i64:
30925 IntermediateVT = RegisterVT = MVT::v2i64;
30926 break;
30927 case MVT::f16:
30928 IntermediateVT = RegisterVT = MVT::v8f16;
30929 break;
30930 case MVT::f32:
30931 IntermediateVT = RegisterVT = MVT::v4f32;
30932 break;
30933 case MVT::f64:
30934 IntermediateVT = RegisterVT = MVT::v2f64;
30935 break;
30936 case MVT::bf16:
30937 IntermediateVT = RegisterVT = MVT::v8bf16;
30938 break;
30939 }
30940
30941 return NumRegs;
30942}
30943
30944 bool AArch64TargetLowering::hasInlineStackProbe(
30945 const MachineFunction &MF) const {
30946 return !Subtarget->isTargetWindows() &&
30947 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
30948}
30949
30951 switch (Opc) {
30955 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
30956 return true;
30957 }
30958
30960}
30961
30962 bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
30963 EVT VT) const {
30964 return Subtarget->hasCPA() && UseFEATCPACodegen;
30965}
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
SDValue tryLowerPartialReductionToWideAdd(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static unsigned getFPSubregForVT(EVT VT)
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static const MachineInstr * stripVRegCopies(const MachineRegisterInfo &MRI, Register Reg)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
If the operand is a bitwise AND with a constant RHS, and the shift has a constant RHS and is the only...
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
bool isVectorizedBinOp(unsigned Opcode)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue emitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &DL, SelectionDAG &DAG)
Emit vector comparison for floating-point values, producing a mask.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static SDValue performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static bool shouldLowerTailCallStackArg(const MachineFunction &MF, const CCValAssign &VA, SDValue Arg, ISD::ArgFlagsTy Flags, int CallOffset)
Check whether a stack argument requires lowering in a tail call.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool isCMP(SDValue Op)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, const AtomicRMWInst *RMW)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
unsigned numberOfInstrToLoadImm(APInt C)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs Currently only those that pass varargs like the C...
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parseSVERegAsConstraint(StringRef Constraint)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
constexpr MVT CondCodeVT
Value type used for condition codes.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG)
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
bool isLegalCmpImmed(APInt C)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, SDValue RHS={})
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
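For orientation, a minimal sketch (hypothetical name, not the in-tree function) of the integer condition-code mapping this helper performs; the real version also takes RHS so it can adjust comparisons against boundary constants:
static AArch64CC::CondCode changeIntCCToAArch64CCSketch(ISD::CondCode CC) {
  switch (CC) {
  default:          llvm_unreachable("Unknown integer condition code!");
  case ISD::SETEQ:  return AArch64CC::EQ;
  case ISD::SETNE:  return AArch64CC::NE;
  case ISD::SETLT:  return AArch64CC::LT;
  case ISD::SETLE:  return AArch64CC::LE;
  case ISD::SETGT:  return AArch64CC::GT;
  case ISD::SETGE:  return AArch64CC::GE;
  case ISD::SETULT: return AArch64CC::LO; // unsigned lower
  case ISD::SETULE: return AArch64CC::LS; // unsigned lower or same
  case ISD::SETUGT: return AArch64CC::HI; // unsigned higher
  case ISD::SETUGE: return AArch64CC::HS; // unsigned higher or same
  }
}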
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
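A plausible sketch (hypothetical name) of the CSET materialization described above: select between the constants 1 and 0 under condition CC, reading the flags value NZCV, using the getCondCode helper listed later in this index:
static SDValue getSETCCSketch(AArch64CC::CondCode CC, SDValue NZCV,
                              const SDLoc &DL, SelectionDAG &DAG) {
  // CSET Wd, cond == CSINC Wd, WZR, WZR, invert(cond) == select(cond, 1, 0).
  return DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32,
                     DAG.getConstant(1, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     getCondCode(DAG, CC), NZCV);
}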
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Tries to replace scalar FP <-> INT conversions with SVE in streaming functions, this can help to redu...
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
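A hedged sketch (hypothetical name) of the usual way such widening is expressed in the DAG: place the 64-bit value in the low half of an otherwise-undef 128-bit vector.
static SDValue WidenVectorSketch(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                VT.getVectorNumElements() * 2);
  SDLoc DL(V64Reg);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT),
                     V64Reg, DAG.getVectorIdxConstant(0, DL));
}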
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC)
Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, ISD::CondCode CC, bool NoNaNs, const SDLoc &DL, SelectionDAG &DAG)
For SELECT_CC, when the true/false values are (-1, 0) and the compared values are scalars,...
static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC)
static SDValue combineStoreValueFPToInt(StoreSDNode *ST, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
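A compressed sketch (hypothetical name) of the splat-and-range check that getVShiftImm and isVShiftLImm perform together, using BuildVectorSDNode::isConstantSplat from the class listed further down; the in-tree helpers additionally validate the splat bit size:
static bool isVShiftLImmSketch(SDValue Op, EVT VT, int64_t &Cnt) {
  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BVN)
    return false;
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
    return false;
  Cnt = SplatBits.getSExtValue();
  // A left-shift immediate must lie in [0, element bits).
  return Cnt >= 0 && Cnt < (int64_t)VT.getScalarSizeInBits();
}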
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
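A hedged sketch of the typical policy behind such a predicate (the exact set of calling conventions is target-specific and may differ from the in-tree function):
static bool canGuaranteeTCOSketch(CallingConv::ID CC, bool GuaranteeTailCalls) {
  // Conventions designed for guaranteed tail calls, plus fastcc when the
  // tail-call-optimization option is in effect.
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}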
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
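A hedged sketch of the underlying idea (hypothetical name): with TBI, only the low 56 address bits are demanded, so demanded-bits simplification can strip instructions that merely compute the top byte.
static bool performTBISimplificationSketch(SDValue Addr,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56); // top byte ignored by HW
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO))
    return false;
  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}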
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) such that "BinOp V, RHS" can be simplified.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
Register const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable; if it's a VBR-encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setSMESaveBufferUsed(bool Used=true)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
std::optional< uint16_t > getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const
Compute the integer discriminator for a given BlockAddress constant, if blockaddress signing is enabl...
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool isStreamingSVEAvailable() const
Returns true if the target has access to the streaming-compatible subset of SVE instructions.
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool isNonStreamingSVEorSME2Available() const
Returns true if the target has access to either the full range of SVE instructions,...
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override
Return true if the @llvm.experimental.vector.partial.reduce.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this functions.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: "sub y, (xor x, -1)" and "add (add x, 1), y". The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1041
unsigned logBase2() const
Definition: APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
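A small, self-contained usage sketch (not taken from this file) exercising a few of the APInt helpers listed above:
#include "llvm/ADT/APInt.h"
#include <cassert>
using llvm::APInt;
void apintSketch() {
  APInt Imm = APInt::getLowBitsSet(64, 16);      // 0x000000000000FFFF
  assert(Imm.isMask(16) && Imm.popcount() == 16 && Imm.countr_zero() == 0);
  APInt Narrow = Imm.trunc(32);                  // same 16-bit mask, now 32 bits wide
  APInt Wide = Narrow.zext(64);                  // zero-extended back to 64 bits
  assert(Wide == Imm && Wide.getBitWidth() == 64);
  APInt SignBit = APInt::getSignMask(64);        // only bit 63 set
  assert(SignBit.isPowerOf2() && SignBit.isNegative() && SignBit.logBase2() == 63);
}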
an instruction to allocate memory on the stack
Definition: Instructions.h:64
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
Definition: Instructions.h:765
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ And
*p = old & v
Definition: Instructions.h:729
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
Definition: Instructions.h:761
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
bool isFloatingPointOperation() const
Definition: Instructions.h:898
BinOp getOperation() const
Definition: Instructions.h:819
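An illustrative helper (hypothetical, not the AArch64 policy) showing how the AtomicRMWInst accessors above are typically queried:
static bool isIntegerMinMaxRMWSketch(const llvm::AtomicRMWInst &RMW) {
  using llvm::AtomicRMWInst;
  if (RMW.isFloatingPointOperation())   // FAdd/FMin/FMax/FMinimum/FMaximum, etc.
    return false;
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax:
    return true;
  default:
    return false;
  }
}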
This is an SDNode representing atomic operations.
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
const BlockAddress * getBlockAddress() const
The address of a basic block.
Definition: Constants.h:899
Function * getFunction() const
Definition: Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
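A generic sketch of how the CCState/CCValAssign accessors above are typically combined when lowering arguments; CallConv, IsVarArg, MF, Ins, DAG and CC_Sketch (a stand-in CCAssignFn) are assumed context, not names from this file:
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Sketch);
for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    Register Reg = VA.getLocReg();           // argument arrives in a register
    (void)Reg;
  } else {
    assert(VA.isMemLoc());
    int64_t Offset = VA.getLocMemOffset();   // argument arrives on the stack
    (void)Offset;
  }
}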
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
unsigned arg_size() const
Definition: InstrTypes.h:1290
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:198
bool isBigEndian() const
Definition: DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
A debug info location.
Definition: DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:315
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
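A tiny illustration (standalone, not from this file) of the ElementCount factories above: a fixed 4-element count versus a scalable 4 x vscale count.
llvm::ElementCount FixedEC = llvm::ElementCount::getFixed(4);
llvm::ElementCount ScalEC  = llvm::ElementCount::getScalable(4);
assert(!FixedEC.isScalar() && FixedEC != ScalEC);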
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:604
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1036
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
arg_iterator arg_end()
Definition: Function.h:875
arg_iterator arg_begin()
Definition: Function.h:866
size_t size() const
Definition: Function.h:856
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:265
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:531
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:132
Type * getValueType() const
Definition: GlobalValue.h:298
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2214
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1093
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1107
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2251
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1115
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:502
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2128
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:512
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2142
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:533
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
LLVMContext & getContext() const
Definition: IRBuilder.h:203
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1532
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:101
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e. the number of whole bytes needed to represent the size in bits.
Definition: LowLevelType.h:201
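A small illustrative sketch of the LLT factories and queries above as they typically appear in GlobalISel code (the header path is an assumption and may differ between releases):

  #include "llvm/CodeGenTypes/LowLevelType.h" // assumed header location
  using namespace llvm;

  void lltExamples() {
    LLT S64 = LLT::scalar(64);             // a plain 64-bit "bag of bits"
    LLT V4S32 = LLT::fixed_vector(4, 32);  // fixed-width <4 x s32>
    unsigned EltBits = V4S32.getScalarSizeInBits(); // 32
    TypeSize Bytes = V4S32.getSizeInBytes();        // 16 fixed bytes
    (void)S64; (void)EltBits; (void)Bytes;
  }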
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
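As a rough sketch (assumed header path; the value types chosen are for illustration only), the MVT factories and predicates above are typically used like this:

  #include "llvm/CodeGenTypes/MachineValueType.h" // assumed header location
  #include <cassert>
  using namespace llvm;

  void mvtExamples() {
    MVT V2F64 = MVT::getVectorVT(MVT::f64, 2);           // fixed-width <2 x f64>
    assert(V2F64.is128BitVector() && V2F64.isFloatingPoint());
    MVT Elt = V2F64.getVectorElementType();              // f64
    MVT Half = V2F64.getHalfNumVectorElementsVT();       // <1 x f64>
    MVT NxV4I32 = MVT::getScalableVectorVT(MVT::i32, 4); // <vscale x 4 x i32>
    assert(NxV4I32.isScalableVector() && NxV4I32.isInteger());
    (void)Elt; (void)Half;
  }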
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
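A hedged sketch (not this file's code; the function, sizes, and offsets are placeholders) of how the MachineFrameInfo calls above are commonly combined:

  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  using namespace llvm;

  void frameObjects(MachineFunction &MF) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    // A 16-byte, 16-byte-aligned spill slot.
    int SpillFI = MFI.CreateSpillStackObject(16, Align(16));
    // An 8-byte immutable object at offset 0 from the incoming SP (e.g. a stack argument).
    int FixedFI = MFI.CreateFixedObject(8, /*SPOffset=*/0, /*IsImmutable=*/true);
    int64_t SpillSize = MFI.getObjectSize(SpillFI);
    int64_t FixedOff = MFI.getObjectOffset(FixedFI);
    (void)SpillSize; (void)FixedOff;
  }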
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
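These MachineInstrBuilder methods are usually reached through BuildMI; a minimal sketch follows (the opcode, registers, and immediate are placeholders, not real AArch64 operands):

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Emit "DestReg = <Opc> SrcReg, #Imm" immediately before MI.
  static void emitExample(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                          const DebugLoc &DL, const TargetInstrInfo &TII,
                          unsigned Opc, Register DestReg, Register SrcReg,
                          int64_t Imm) {
    BuildMI(MBB, MI, DL, TII.get(Opc), DestReg)
        .addReg(SrcReg)
        .addImm(Imm);
  }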
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:56
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by this operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:710
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:352
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:146
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
bool isAssert() const
Test if this node is an assert operation.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
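As a rough sketch (purely illustrative, not a combine taken from this file), the SDValue/SDNode accessors above are typically used to match a pattern like so:

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Return X when V matches (add X, 1); otherwise return an empty SDValue.
  static SDValue matchAddOne(SDValue V) {
    if (V.getOpcode() != ISD::ADD || V.getNumOperands() != 2)
      return SDValue();
    auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1).getNode());
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return V.getOperand(0);
  }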
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
bool hasZT0State() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:639
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
Definition: SelectionDAG.h:941
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:506
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:902
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:808
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
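A minimal, hedged sketch of composing nodes with the SelectionDAG factories above; the pattern built and the helper name are illustrative only:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Build (xor (not X), Mask): getNOT emits (xor X, -1) and getNode composes
  // the outer XOR in the same value type as X.
  static SDValue buildNotThenXor(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                                 uint64_t MaskVal) {
    EVT VT = X.getValueType();
    SDValue NotX = DAG.getNOT(DL, X, VT);
    SDValue Mask = DAG.getConstant(MaskVal, DL, VT);
    return DAG.getNode(ISD::XOR, DL, VT, NotX, Mask);
  }

Constant operands created this way are ordinary nodes that later combines can still fold; target-opaque constants would instead use getTargetConstant, listed above.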
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
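For reference, a tiny illustrative sketch of the static mask predicates above (the mask values are chosen arbitrarily):

  #include "llvm/IR/Instructions.h"
  #include <cassert>
  using namespace llvm;

  void maskPredicates() {
    int Rev[] = {3, 2, 1, 0};
    assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));

    // <0, 2, 4, 6> de-interleaves a two-way interleaved vector, starting at index 0.
    int DeInt[] = {0, 2, 4, 6};
    unsigned Index = 0;
    assert(ShuffleVectorInst::isDeInterleaveMaskOfFactor(DeInt, /*Factor=*/2, Index) &&
           Index == 0);
    (void)Rev; (void)DeInt; (void)Index;
  }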
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:287
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:480
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:581
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:269
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition: StringRef.h:619
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:694
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
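A small illustrative sketch combining the StringRef and StringSwitch operations above, loosely in the style of operand-name parsing; the names and numeric values are made up:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  // Map names like "x13" or "w13" to their numeric suffix; return -1 on failure.
  static int parseRegNumber(StringRef Name) {
    if (!Name.starts_with("x") && !Name.starts_with("w"))
      return -1;
    unsigned Num = 0;
    if (Name.drop_front().getAsInteger(10, Num)) // returns true on parse failure
      return -1;
    return Num;
  }

  // Translate a mnemonic suffix to an immediate; 0 is the fallback.
  static unsigned parseKind(StringRef Kind) {
    return StringSwitch<unsigned>(Kind)
        .Case("sy", 15)
        .Case("ld", 13)
        .Default(0);
  }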
Class to represent struct types.
Definition: DerivedTypes.h:218
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
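A hedged sketch of how a target's TargetLowering constructor typically drives the configuration hooks listed above; ExampleTargetLowering and the particular types/actions are invented for illustration and do not describe AArch64's actual settings:

  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  namespace {
  class ExampleTargetLowering : public TargetLowering {
  public:
    explicit ExampleTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
      // Tell the legalizer what to do with operations the target lacks.
      setOperationAction(ISD::SELECT, MVT::i64, Custom);
      setOperationAction(ISD::MUL, MVT::v2i64, Expand);
      // No native f16: extending loads and truncating stores must be expanded.
      setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
      setTruncStoreAction(MVT::f32, MVT::f16, Expand);
      // Global properties consumed elsewhere during lowering.
      setMaxAtomicSizeInBitsSupported(128);
      setBooleanContents(ZeroOrOneBooleanContent);
    }
  };
  } // namespace

In a real backend these calls sit alongside addRegisterClass() and a final computeRegisterProperties(), which establish the legal types before the per-operation actions are queried.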
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op. At this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:686
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:264
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
@ HalfTyID
16-bit floating point type
Definition: Type.h:56
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition: Type.h:57
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:311
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
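A brief sketch of the Type queries above; Ctx stands for an assumed LLVMContext.
  Type *F32 = Type::getFloatTy(Ctx);
  IntegerType *I64 = Type::getInt64Ty(Ctx);
  bool IsFP = F32->isFloatingPointTy();        // true
  bool IsInt = I64->isIntegerTy();             // true
  unsigned Bits = I64->getScalarSizeInBits();  // 64
  Type *Scalar = F32->getScalarType();         // float (not a vector)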
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5465
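A hedged sketch of the Use/User/Value queries above, e.g. inside an IR peephole; V and Repl are assumed Value pointers from the surrounding code.
  if (V->hasOneUse()) {
    Use &OnlyUse = *V->use_begin();  // the single def-use edge
    User *U = OnlyUse.getUser();     // the instruction (or constant) using V
    if (U->getNumOperands() == 2 && U->getOperand(0) == V)
      V->replaceAllUsesWith(Repl);   // reroute every use of V to Repl
  }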
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:534
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:481
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:499
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:255
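A minimal sketch combining the VectorType and ElementCount helpers above; Ctx is an assumed LLVMContext.
  auto *V4F32 =
      VectorType::get(Type::getFloatTy(Ctx), ElementCount::getFixed(4));
  VectorType *V4I32 = VectorType::getInteger(V4F32);                // <4 x i32>
  VectorType *V2F32 = VectorType::getHalfElementsVectorType(V4F32); // <2 x float>
  ElementCount EC = V4F32->getElementCount();
  bool Scalable = EC.isScalable();               // false for this fixed type
  unsigned MinElts = EC.getKnownMinValue();      // 4
  ElementCount Half = EC.divideCoefficientBy(2); // 2 elements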
self_iterator getIterator()
Definition: ilist_node.h:134
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
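A tiny sketch of the condition-code helpers above; the choice of GE is an arbitrary example.
  AArch64CC::CondCode CC = AArch64CC::GE;
  AArch64CC::CondCode Inv = AArch64CC::getInvertedCondCode(CC); // AArch64CC::LT
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Inv);     // flags for LT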
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
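A hedged sketch of the immediate helpers above: test whether a constant is encodable as a logical immediate and, if not, count how many move-immediate instructions expandMOVImm would emit; Imm is an arbitrary example constant.
  uint64_t Imm = 0x00ff00ff00ff00ffULL;
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64); // N:immr:imms
    (void)Enc;
  } else {
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(Imm, /*BitSize=*/64, Insn);
    unsigned MovInsnCount = Insn.size(); // MOVZ/MOVN plus MOVK sequence length
    (void)MovInsnCount;
  }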
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:271
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1386
@ PARTIAL_REDUCE_SMLA
Definition: ISDOpcodes.h:1510
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1458
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1491
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ STRICT_FATAN2
Definition: ISDOpcodes.h:441
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ STRICT_FCEIL
Definition: ISDOpcodes.h:454
@ STRICT_FTANH
Definition: ISDOpcodes.h:444
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1131
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ VECTOR_FIND_LAST_ACTIVE
Definition: ISDOpcodes.h:1550
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
Definition: ISDOpcodes.h:1098
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:1020
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
Definition: ISDOpcodes.h:1094
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1476
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1480
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1135
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1490
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ STRICT_FLOG2
Definition: ISDOpcodes.h:449
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1309
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1568
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1384
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ PARTIAL_REDUCE_UMLA
Definition: ISDOpcodes.h:1511
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ STRICT_FASIN
Definition: ISDOpcodes.h:438
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:117
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1473
@ STRICT_FATAN
Definition: ISDOpcodes.h:440
@ WRITE_REGISTER
Definition: ISDOpcodes.h:135
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ TRUNCATE_SSAT_U
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1477
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ STRICT_LROUND
Definition: ISDOpcodes.h:459
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition: ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ STRICT_FPOWI
Definition: ISDOpcodes.h:433
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1492
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:663
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1387
@ GET_ACTIVE_LANE_MASK
Definition: ISDOpcodes.h:1559
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:458
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1485
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ ATOMIC_LOAD_FMAXIMUM
Definition: ISDOpcodes.h:1388
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1376
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:48
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:452
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:453
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ STRICT_FSINH
Definition: ISDOpcodes.h:442
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1448
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1325
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ ATOMIC_LOAD_FMINIMUM
Definition: ISDOpcodes.h:1389
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ STRICT_LRINT
Definition: ISDOpcodes.h:461
@ ConstantPool
Definition: ISDOpcodes.h:92
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:627
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ STRICT_FROUND
Definition: ISDOpcodes.h:456
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:477
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1413
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1493
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:455
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ STRICT_FCOSH
Definition: ISDOpcodes.h:443
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:420
@ STRICT_FLOG10
Definition: ISDOpcodes.h:448
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ STRICT_LLRINT
Definition: ISDOpcodes.h:462
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:648
@ STRICT_FEXP2
Definition: ISDOpcodes.h:446
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:122
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ STRICT_LLROUND
Definition: ISDOpcodes.h:460
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:903
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1546
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1481
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ PARTIAL_REDUCE_SUMLA
Definition: ISDOpcodes.h:1512
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ STRICT_FRINT
Definition: ISDOpcodes.h:450
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition: ISDOpcodes.h:611
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition: ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1315
@ TRUNCATE_USAT_U
Definition: ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ STRICT_FACOS
Definition: ISDOpcodes.h:439
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1756
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1647
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1634
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1665
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1636
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
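A short sketch of the ISD helpers above as they typically appear in DAG combines; Op, N and VT are assumed to come from the surrounding combine.
  bool ZeroSplat = ISD::isConstantSplatVectorAllZeros(Op.getNode());
  ISD::CondCode CC = ISD::SETUGT;
  ISD::CondCode InvCC = ISD::getSetCCInverse(CC, VT);      // SETULE
  ISD::CondCode SwapCC = ISD::getSetCCSwappedOperands(CC); // SETULT
  bool PlainLoad = ISD::isNormalLoad(N); // non-extending, unindexed load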
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
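A minimal sketch of the PatternMatch combinators above: recognise a uitofp and capture its source; V is an assumed llvm::Value*.
  using namespace llvm::PatternMatch;
  Value *Src = nullptr;
  if (match(V, m_UIToFP(m_Value(Src)))) {
    // V is `uitofp Src`; Src now holds the integer operand.
  }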
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition: ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:860
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:216
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1587
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition: Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:282
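A small sketch of the bit-math helpers above applied to one example constant.
  uint64_t C = 0x0000ff0000000000ULL;
  bool Pow2 = isPowerOf2_64(C);        // false: eight bits are set
  bool Shifted = isShiftedMask_64(C);  // true: one contiguous run of ones
  unsigned Lsb = countr_zero(C);       // 40: index of the lowest set bit
  unsigned MsbLog2 = Log2_64(C);       // 47: floor log2
  bool Fits = isUIntN(16, C >> Lsb);   // true: the run itself fits in 16 bits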
unsigned M1(unsigned Val)
Definition: VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:270
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
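A hedged sketch of the AArch64 shuffle-mask predicates above on a hand-written zip1 mask for 8 elements; the mask values are an illustrative example.
  int MaskData[] = {0, 8, 1, 9, 2, 10, 3, 11}; // interleave low halves: zip1
  ArrayRef<int> M(MaskData);
  unsigned WhichResult = 0;
  bool IsZip = isZIPMask(M, /*NumElts=*/8, WhichResult); // true, WhichResult==0
  bool IsTrn = isTRNMask(M, /*NumElts=*/8, WhichResult); // false for this mask
  bool IsRev = isREVMask(M, /*EltSize=*/32, /*NumElts=*/8, /*BlockSize=*/64);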
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2139
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2127
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition: Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:397
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:439
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:187
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
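A brief sketch of the EVT queries above; Ctx is an assumed LLVMContext.
  EVT V4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4);
  EVT NxV4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  bool Fixed = V4I32.isFixedLengthVector();   // true
  bool Scalable = NxV4I32.isScalableVector(); // true
  TypeSize Bits = NxV4I32.getSizeInBits();    // known minimum of 128 bits
  EVT Elt = V4I32.getVectorElementType();     // i32
  EVT AsInt =
      EVT::getVectorVT(Ctx, MVT::f32, 4).changeVectorElementTypeToInteger();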
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:427
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:128
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
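A small sketch of the KnownBits operations above, starting from a known constant.
  KnownBits K = KnownBits::makeConstant(APInt(32, 0xff00));
  KnownBits Sh = KnownBits::lshr(K, KnownBits::makeConstant(APInt(32, 8)));
  KnownBits Narrow = Sh.trunc(16);              // still the constant 0x00ff
  KnownBits Both = K.intersectWith(Sh);         // bits agreed on by both values
  unsigned MaxActive = Sh.countMaxActiveBits(); // 8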
Matching combinators.
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
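A minimal sketch of the MachinePointerInfo factories above; MF (a MachineFunction) and FI (a frame index) are assumed from the surrounding lowering code.
  MachinePointerInfo StackPI = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo StackPIPlus8 = StackPI.getWithOffset(8);
  MachinePointerInfo GOTPI = MachinePointerInfo::getGOT(MF);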
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:249
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
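A hedged sketch of the CallLoweringInfo builder pattern above, roughly as a target lowers a libcall; DAG, DL, Chain, Callee, RetTy, Args and TLI are all assumed from the surrounding lowering code.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL)
      .setChain(Chain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  SDValue Result = CallResult.first;    // returned value
  SDValue OutChain = CallResult.second; // updated chain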
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64