AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
117static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future, when both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. So this maximum leaf-node
144// limit guards that cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even though SVE is not yet
150// supported for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
152static cl::opt<bool> EnableSVEGISel(
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
159static cl::opt<bool> UseFEATCPACodegen(
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
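The two tables above follow the AAPCS64 argument-passing rules: the first eight integer arguments go in X0-X7 and the first eight floating-point/vector arguments in Q0-Q7, with the rest passed on the stack. A minimal standalone sketch of that assignment policy (register indices stand in for the MCPhysReg entries; this is not the real calling-convention code):

#include <cstdio>
#include <optional>

// Hand out the next free register index of a class, or report a stack slot
// once all eight argument registers of that class are used.
std::optional<unsigned> assignArgReg(unsigned &NextFree) {
  if (NextFree >= 8)
    return std::nullopt; // passed on the stack
  return NextFree++;     // index into GPRArgRegs (X0..X7) or FPRArgRegs (Q0..Q7)
}

int main() {
  unsigned NextGPR = 0;
  for (int Arg = 0; Arg < 10; ++Arg) {
    if (std::optional<unsigned> Reg = assignArgReg(NextGPR))
      std::printf("integer arg %d -> X%u\n", Arg, *Reg);
    else
      std::printf("integer arg %d -> stack\n", Arg);
  }
  return 0;
}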
177
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
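The mapping above always picks the container whose known-minimum size is one 128-bit SVE granule, i.e. 128 / element-bits lanes per vector increment. A standalone sketch of that invariant:

#include <cstdio>

// Minimum lane count of the packed SVE container for a given element width.
constexpr unsigned packedMinLanes(unsigned EltBits) { return 128 / EltBits; }

static_assert(packedMinLanes(8) == 16, "i8 maps to nxv16i8");
static_assert(packedMinLanes(16) == 8, "i16/f16/bf16 map to 8-lane containers");
static_assert(packedMinLanes(32) == 4, "i32/f32 map to 4-lane containers");
static_assert(packedMinLanes(64) == 2, "i64/f64 map to 2-lane containers");

int main() {
  for (unsigned Bits : {8u, 16u, 32u, 64u})
    std::printf("element %2u bits -> %u minimum lanes (packed)\n", Bits,
                packedMinLanes(Bits));
}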
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207static inline EVT getPackedSVEVectorVT(ElementCount EC) {
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222static inline EVT getPromotedVTForPredicate(EVT VT) {
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
245 assert(DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
248 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
249}
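A compile-time restatement of the packed test, assuming AArch64::SVEBitsPerBlock is 128 (one SVE register granule): a type is packed when it is fixed-length, or scalable with a known minimum size of exactly one block.

// Standalone sketch, not the LLVM helper itself.
constexpr unsigned SVEBitsPerBlockModel = 128;

constexpr bool isPackedModel(bool IsFixedLength, unsigned KnownMinBits) {
  return IsFixedLength || KnownMinBits == SVEBitsPerBlockModel;
}

static_assert(isPackedModel(false, 128), "nxv8f16 fills a block: packed");
static_assert(!isPackedModel(false, 64), "nxv4f16 fills half a block: unpacked");
static_assert(isPackedModel(true, 64), "fixed-length v4f16 counts as packed");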
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
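Lane-wise, every *_MERGE_PASSTHRU node follows the convention described above: operand 0 is the governing predicate, the last operand is the passthru, and inactive lanes copy the passthru value. A small standalone model, using an absolute-value operation as the example:

#include <array>
#include <cassert>
#include <cstdlib>

// result[i] = pred[i] ? op(src[i]) : passthru[i]
template <std::size_t N>
std::array<int, N> absMergePassthru(const std::array<bool, N> &Pred,
                                    const std::array<int, N> &Src,
                                    const std::array<int, N> &Passthru) {
  std::array<int, N> Out{};
  for (std::size_t I = 0; I < N; ++I)
    Out[I] = Pred[I] ? std::abs(Src[I]) : Passthru[I];
  return Out;
}

int main() {
  std::array<int, 4> R = absMergePassthru<4>({true, false, true, false},
                                             {-1, -2, -3, -4}, {9, 9, 9, 9});
  assert(R[0] == 1 && R[1] == 9 && R[2] == 3 && R[3] == 9);
  return 0;
}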
290
291// Returns true if inactive lanes are known to be zeroed by construction.
292static bool isZeroingInactiveLanes(SDValue Op) {
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
297 case ISD::SPLAT_VECTOR:
298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
302 case ISD::INTRINSIC_WO_CHAIN:
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
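The property checked here matters because a predicate produced by PTRUE, an SVE compare, or a while instruction can be reinterpreted at a wider predicate type without exposing stale bits: every lane beyond the active ones is zero by construction. A standalone model of that guarantee:

#include <array>
#include <cassert>

// Model of a PTRUE-style producer: the result is value-initialised, so every
// lane not explicitly made active stays zero.
std::array<unsigned, 8> modelPtrue(unsigned ActiveLanes) {
  std::array<unsigned, 8> Pred{};
  for (unsigned I = 0; I < ActiveLanes && I < Pred.size(); ++I)
    Pred[I] = 1;
  return Pred;
}

int main() {
  std::array<unsigned, 8> Pred = modelPtrue(4);
  for (unsigned I = 4; I < Pred.size(); ++I)
    assert(Pred[I] == 0 && "inactive lanes are zeroed by construction");
  return 0;
}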
353
354static std::tuple<SDValue, SDValue>
355extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
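A standalone model of the split performed above: a discriminator is either a ptrauth blend of an address part and a small constant, or a lone constant/address, and the constant half is only folded when it fits in an unsigned 16-bit immediate. The types and names below are illustrative, not the LLVM ones.

#include <cassert>
#include <cstdint>
#include <optional>

struct Split {
  uint16_t ConstDisc;                 // 16-bit immediate half
  std::optional<uint64_t> AddrDisc;   // address half, if any
};

std::optional<Split> splitDiscriminator(std::optional<uint64_t> AddrDisc,
                                        uint64_t ConstDisc) {
  if (ConstDisc > 0xFFFF)
    return std::nullopt; // too wide to fold: compute the discriminator separately
  return Split{static_cast<uint16_t>(ConstDisc), AddrDisc};
}

int main() {
  assert(splitDiscriminator(std::nullopt, 42)->ConstDisc == 42);
  assert(!splitDiscriminator(0x1234, 0x12345)); // constant does not fit in 16 bits
  return 0;
}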
387
388AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
486 computeRegisterProperties(Subtarget->getRegisterInfo());
487
488 // Provide all sorts of operation actions
506 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
507 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
508 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
509 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
510 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
511 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
514 if (Subtarget->hasFPARMv8()) {
517 }
526 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
528 setOperationAction(ISD::BRIND, MVT::Other, Custom);
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
548 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
549 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
550
551 // Virtually no operation on f128 is legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
553 setOperationAction(ISD::FABS, MVT::f128, Expand);
556 setOperationAction(ISD::FCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FNEG, MVT::f128, Expand);
561 setOperationAction(ISD::FPOW, MVT::f128, Expand);
563 setOperationAction(ISD::FRINT, MVT::f128, Expand);
564 setOperationAction(ISD::FSIN, MVT::f128, Expand);
565 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
566 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
568 setOperationAction(ISD::FTAN, MVT::f128, Expand);
569 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
573 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
576 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
625 setOperationAction(ISD::VASTART, MVT::Other, Custom);
626 setOperationAction(ISD::VAARG, MVT::Other, Custom);
627 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
628 setOperationAction(ISD::VAEND, MVT::Other, Expand);
629
630 // Variable-sized objects.
631 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
632 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
633
634 // Lowering Funnel Shifts to EXTR
639
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
737 setOperationAction(ISD::FSIN, MVT::f32, Expand);
738 setOperationAction(ISD::FSIN, MVT::f64, Expand);
739 setOperationAction(ISD::FCOS, MVT::f32, Expand);
740 setOperationAction(ISD::FCOS, MVT::f64, Expand);
741 setOperationAction(ISD::FPOW, MVT::f32, Expand);
742 setOperationAction(ISD::FPOW, MVT::f64, Expand);
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
754 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
755 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
756 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
757 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
758 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
759 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
782 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
783 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
786 // fpextend from bf16 to f64 needs to be split into two fpextends
787 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
794 ISD::BR_CC,
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
800 ISD::FCEIL,
801 ISD::FSQRT,
802 ISD::FFLOOR,
803 ISD::FNEARBYINT,
804 ISD::FRINT,
805 ISD::FROUND,
806 ISD::FROUNDEVEN,
807 ISD::FTRUNC,
808 ISD::FMINNUM,
809 ISD::FMAXNUM,
810 ISD::FMINIMUM,
811 ISD::FMAXIMUM,
812 ISD::FMINIMUMNUM,
813 ISD::FMAXIMUMNUM,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
838 // Round-to-integer operations need custom lowering for fp16, as Promote
839 // doesn't work because the result type is integer.
840 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
843 setOperationAction(Op, ScalarVT, Custom);
844
845 // promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
861 setOperationAction(ISD::FABS, V4Narrow, Legal);
862 setOperationAction(ISD::FNEG, V4Narrow, Legal);
864 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
868 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
874 setOperationAction(ISD::FABS, V8Narrow, Legal);
876 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
879 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
882 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
883 setOperationAction(ISD::FNEG, V8Narrow, Legal);
884 setOperationAction(ISD::FROUND, V8Narrow, Legal);
885 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
886 setOperationAction(ISD::FRINT, V8Narrow, Legal);
887 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
889 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
890 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
893 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
906 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
907 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
908 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
909 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
910 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
911 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
917 for (MVT Ty : {MVT::f32, MVT::f64})
918 setOperationAction(Op, Ty, Legal);
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
928 setOperationAction(Op, Ty, Legal);
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
933 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
934
936 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
937 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
938 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
939 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
940
941 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
943 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
944 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
945 } else {
946 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
947 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
948 }
949 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
950 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
951
952 // Generate outline atomics library calls only if LSE was not specified for
953 // the subtarget.
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
955 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
956 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
957 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
958 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
959 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
960 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
961 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
962 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
963 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
974 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
975 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
976 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
977 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
980 }
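A brief illustration of what the LibCall action means here, assuming the code is compiled with outline atomics enabled and without guaranteed LSE: the atomic RMW below becomes a call to an outline helper such as __aarch64_ldadd4_relax (provided by compiler-rt or libgcc), which dispatches to an LSE LDADD or an LL/SC loop at run time.

#include <atomic>

// Source pattern that maps onto the ATOMIC_LOAD_ADD handling above.
unsigned bump(std::atomic<unsigned> &Counter) {
  return Counter.fetch_add(1, std::memory_order_relaxed);
}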
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
983 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
986 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
987
988 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
991 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
992
993 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
996 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
997
998 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
1001 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
1002
1003 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
1004 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
1005 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1006 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1012 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1013 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1014 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::STORE, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1024 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1025 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1026 }
1027
1028 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1029 // custom lowering, as there are no un-paired non-temporal stores and
1030 // legalization will break up 256 bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1039
1040 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1041 // custom lowering, as there are no un-paired non-temporal loads and
1042 // legalization will break up 256-bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1053 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1058 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060 } else {
1061 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063 }
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
1073 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1093 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1094 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1095 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1096 }
1097
1098 // Indexed loads and stores are supported.
1099 for (unsigned im = (unsigned)ISD::PRE_INC;
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
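The indexed-addressing legality above lets a load or store that is immediately followed by a pointer update be selected as a single pre- or post-indexed instruction (for example ldr w8, [x0], #4). A typical source pattern that benefits:

// Each iteration's load plus pointer bump is a post-indexed LDR candidate.
int sumArray(const int *Ptr, int Count) {
  int Sum = 0;
  for (int I = 0; I < Count; ++I)
    Sum += *Ptr++;
  return Sum;
}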
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1121 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1130 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1131 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
1134 // Also, try to fold ADD into CSINC/CSINV.
1137
1140
1141 // Try to combine setcc with csel.
1143
1145
1149 ISD::STORE, ISD::BUILD_VECTOR});
1152 setTargetDAGCombine(ISD::LOAD);
1153
1154 setTargetDAGCombine(ISD::MSTORE);
1155
1157
1159
1162 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1163
1165 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1166
1167 setTargetDAGCombine(ISD::FP_EXTEND);
1168
1170
1172
1173 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1174
1175 setTargetDAGCombine(ISD::VECREDUCE_AND);
1176 setTargetDAGCombine(ISD::VECREDUCE_OR);
1177 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1178
1180
1183
1184 // In case of strict alignment, avoid an excessive number of byte wide stores.
1187 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1188
1192 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1193
1196 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1197
1200 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1201
1203
1205
1206 EnableExtLdPromotion = true;
1207
1208 // Set required alignment.
1210 // Set preferred alignments.
1211
1212 // Don't align loops on Windows. The SEH unwind info generation needs to
1213 // know the exact length of functions before the alignments have been
1214 // expanded.
1215 if (!Subtarget->isTargetWindows())
1219
1220 // Only change the limit for entries in a jump table if specified by
1221 // the subtarget, but not at the command line.
1222 unsigned MaxJT = STI.getMaximumJumpTableSize();
1223 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1225
1227
1229
1231 if (Subtarget->hasSME())
1233
1234 if (Subtarget->isNeonAvailable()) {
1235 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1236 // silliness like this:
1237 // clang-format off
1238 for (auto Op :
1239 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1240 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1242 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1243 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1244 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1245 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1246 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1247 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1248 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1249 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1250 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1251 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1252 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1259 setOperationAction(Op, MVT::v1f64, Expand);
1260 // clang-format on
1261
1262 for (auto Op :
1267 setOperationAction(Op, MVT::v1i64, Expand);
1268
1269 // AArch64 doesn't have direct vector->f32 conversion instructions for
1270 // elements smaller than i32, so promote the input to i32 first.
1271 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1273
1274 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1275 // Nor is there a direct i32 -> f16 vector conversion. Set it to Custom, so
1276 // the conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1279 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1281
1282 if (Subtarget->hasFullFP16()) {
1285
1294 } else {
1295 // when AArch64 doesn't have fullfp16 support, promote the input
1296 // to i32 first.
1297 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1299 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1301 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1303 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1305 }
1306
1307 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1308 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1315 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1320 }
1321
1322 // Custom handling for some quad-vector types to detect MULL.
1323 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1324 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1325 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1326 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1327 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1328 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1329
1330 // Saturates
1331 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1332 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1337 }
1338
1339 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1340 MVT::v4i32}) {
1347 }
1348
1349 // Vector reductions
1350 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1351 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1352 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1353 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1354 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1355 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1356 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1357
1358 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1359 }
1360 }
1361 if (Subtarget->hasFullFP16())
1362 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1363
1364 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1365 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1366 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1367 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1368 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1369 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1370 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1371 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1372 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1373 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1374 }
1375 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1376 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1377 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1378 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1379
1381 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1382 // Likewise, narrowing and extending vector loads/stores aren't handled
1383 // directly.
1386
1387 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1390 } else {
1393 }
1396
1399
1400 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1401 setTruncStoreAction(VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1404 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1405 }
1406 }
1407
1408 for (auto Op :
1409 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1410 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1414 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1416 if (Subtarget->hasFullFP16())
1417 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1419 }
1420
1421 // LRINT and LLRINT.
1422 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1423 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1425 if (Subtarget->hasFullFP16())
1426 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1428 }
1429
1430 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1431
1432 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1433 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1434 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1435 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1436
1437 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1438 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1439 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1440
1441 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1447
1448 // ADDP custom lowering
1449 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1451 // FADDP custom lowering
1452 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1454
1455 if (Subtarget->hasDotProd()) {
1456 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457 ISD::PARTIAL_REDUCE_UMLA};
1458
1459 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1461 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1462
1463 if (Subtarget->hasMatMulInt8()) {
1464 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1465 MVT::v16i8, Legal);
1466 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1467 MVT::v16i8, Custom);
1468
1469 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1470 MVT::v8i8, Legal);
1471 }
1472 }
1473
1474 } else /* !isNeonAvailable */ {
1476 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1478
1479 if (VT.is128BitVector() || VT.is64BitVector()) {
1480 setOperationAction(ISD::LOAD, VT, Legal);
1481 setOperationAction(ISD::STORE, VT, Legal);
1482 setOperationAction(ISD::BITCAST, VT,
1483 Subtarget->isLittleEndian() ? Legal : Expand);
1484 }
1485 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1486 setTruncStoreAction(VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1489 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1490 }
1491 }
1492 }
1493
1494 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1498 }
1499
1500 if (Subtarget->hasSME()) {
1502 }
1503
1504 // FIXME: Move lowering for more nodes here if those are common between
1505 // SVE and SME.
1506 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1507 for (auto VT :
1508 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1513 }
1514 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1515 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1516 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1517 }
1518
1519 if (Subtarget->hasSVE2p1() ||
1520 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1522
1523 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1525 }
1526
1527 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1528 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1539 setOperationAction(ISD::MLOAD, VT, Custom);
1540 setOperationAction(ISD::MSTORE, VT, Legal);
1560 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1561 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1562 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1563 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1564 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1565 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1566 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1567 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1570
1576
1585
1590
1591 if (!Subtarget->isLittleEndian())
1592 setOperationAction(ISD::BITCAST, VT, Custom);
1593
1594 if (Subtarget->hasSVE2() ||
1595 (Subtarget->hasSME() && Subtarget->isStreaming()))
1596 // For SLI/SRI.
1598 }
1599
1600 // Illegal unpacked integer vector types.
1601 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1604 }
1605
1606 // Type legalize unpacked bitcasts.
1607 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1608 setOperationAction(ISD::BITCAST, VT, Custom);
1609
1610 for (auto VT :
1611 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1612 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1614
1615 for (auto VT :
1616 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1621 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1623 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1624
1628
1629 // There are no legal MVT::nxv16f## based types.
1630 if (VT != MVT::nxv16i1) {
1635 }
1636 }
1637
1638 // NEON doesn't support masked loads/stores, but SME and SVE do.
1639 for (auto VT :
1640 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1641 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1642 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1643 setOperationAction(ISD::MLOAD, VT, Custom);
1644 setOperationAction(ISD::MSTORE, VT, Custom);
1645 }
1646
1647 // Firstly, exclude all scalable vector extending loads/truncating stores,
1648 // including both integer and floating-point scalable vectors.
1650 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1651 setTruncStoreAction(VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1654 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1655 }
1656 }
1657
1658 // Then, selectively enable those which we directly support.
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1661 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1663 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1664 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1665 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1668 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1670 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1671 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1672 }
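The combinations marked Legal above correspond to SVE's widening loads and narrowing stores, e.g. LD1SB/LD1B into 64-bit lanes or ST1B from wider lanes, so the extension or truncation is folded into the memory operation. The scalar shape of the sign-extending case, whose vectorised form is the ld1sb { z0.d } pattern, looks like this:

#include <cstdint>

// i8 -> i64 sign extension folded into the load when vectorised for SVE.
int64_t loadSExtByte(const int8_t *Base, int64_t Index) {
  return static_cast<int64_t>(Base[Index]);
}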
1673
1674 // SVE supports truncating stores of 64 and 128-bit vectors
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1677 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1679 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1680
1681 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1682 MVT::nxv4f32, MVT::nxv2f64}) {
1683 setOperationAction(ISD::BITCAST, VT, Custom);
1686 setOperationAction(ISD::MLOAD, VT, Custom);
1694 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1695 setOperationAction(ISD::FMAXNUM, VT, Custom);
1696 setOperationAction(ISD::FMINIMUM, VT, Custom);
1697 setOperationAction(ISD::FMINNUM, VT, Custom);
1699 setOperationAction(ISD::FNEG, VT, Custom);
1701 setOperationAction(ISD::FCEIL, VT, Custom);
1702 setOperationAction(ISD::FFLOOR, VT, Custom);
1703 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1704 setOperationAction(ISD::FRINT, VT, Custom);
1705 setOperationAction(ISD::LRINT, VT, Custom);
1706 setOperationAction(ISD::LLRINT, VT, Custom);
1707 setOperationAction(ISD::FROUND, VT, Custom);
1708 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1709 setOperationAction(ISD::FTRUNC, VT, Custom);
1710 setOperationAction(ISD::FSQRT, VT, Custom);
1711 setOperationAction(ISD::FABS, VT, Custom);
1712 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1722
1725 setOperationAction(ISD::FPOW, VT, Expand);
1726 setOperationAction(ISD::FPOWI, VT, Expand);
1727 setOperationAction(ISD::FCOS, VT, Expand);
1728 setOperationAction(ISD::FSIN, VT, Expand);
1729 setOperationAction(ISD::FSINCOS, VT, Expand);
1730 setOperationAction(ISD::FTAN, VT, Expand);
1731 setOperationAction(ISD::FACOS, VT, Expand);
1732 setOperationAction(ISD::FASIN, VT, Expand);
1733 setOperationAction(ISD::FATAN, VT, Expand);
1734 setOperationAction(ISD::FATAN2, VT, Expand);
1735 setOperationAction(ISD::FCOSH, VT, Expand);
1736 setOperationAction(ISD::FSINH, VT, Expand);
1737 setOperationAction(ISD::FTANH, VT, Expand);
1738 setOperationAction(ISD::FEXP, VT, Expand);
1739 setOperationAction(ISD::FEXP2, VT, Expand);
1740 setOperationAction(ISD::FEXP10, VT, Expand);
1741 setOperationAction(ISD::FLOG, VT, Expand);
1742 setOperationAction(ISD::FLOG2, VT, Expand);
1743 setOperationAction(ISD::FLOG10, VT, Expand);
1744
1756 }
1757
1758 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1759 setOperationAction(ISD::BITCAST, VT, Custom);
1761 setOperationAction(ISD::FABS, VT, Custom);
1763 setOperationAction(ISD::FNEG, VT, Custom);
1764 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1766 setOperationAction(ISD::MLOAD, VT, Custom);
1774
1775 if (Subtarget->hasSVEB16B16() &&
1776 Subtarget->isNonStreamingSVEorSME2Available()) {
1779 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1780 setOperationAction(ISD::FMAXNUM, VT, Custom);
1781 setOperationAction(ISD::FMINIMUM, VT, Custom);
1782 setOperationAction(ISD::FMINNUM, VT, Custom);
1785 }
1786 }
1787
1788 for (auto Opcode :
1789 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1790 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1791 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1792 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1793 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1795 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1796 }
1797
1798 if (!Subtarget->hasSVEB16B16() ||
1799 !Subtarget->isNonStreamingSVEorSME2Available()) {
1800 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1801 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1802 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1804 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1805 }
1806 }
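The promotions above rely on bf16 being the upper 16 bits of an IEEE-754 binary32 value: an operation is widened to f32, computed there, and the result narrowed back. A standalone model follows (the narrowing step simply truncates for illustration; the real lowering rounds properly):

#include <cstdint>
#include <cstring>

float bf16ToF32(uint16_t B) {
  uint32_t Bits = static_cast<uint32_t>(B) << 16; // bf16 occupies the top half
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

uint16_t f32ToBF16Trunc(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return static_cast<uint16_t>(Bits >> 16); // truncating narrow, illustration only
}

uint16_t bf16Add(uint16_t A, uint16_t B) {
  return f32ToBF16Trunc(bf16ToF32(A) + bf16ToF32(B)); // FADD done in f32 after promotion
}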
1807
1810
1811 // NEON doesn't support integer divides, but SVE does
1812 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1813 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1816 }
1817
1818 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1819 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1820 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1821
1822 // NOTE: Currently this has to happen after computeRegisterProperties rather
1823 // than the preferred option of combining it with the addRegisterClass call.
1824 if (Subtarget->useSVEForFixedLengthVectors()) {
1827 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1828 addTypeForFixedLengthSVE(VT);
1829 }
1832 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1833 addTypeForFixedLengthSVE(VT);
1834 }
1835
1836 // 64-bit results can mean a bigger-than-NEON input.
1837 for (auto VT : {MVT::v8i8, MVT::v4i16})
1840
1841 // 128-bit results imply a bigger-than-NEON input.
1842 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1844 for (auto VT : {MVT::v8f16, MVT::v4f32})
1846
1847 // These operations are not supported on NEON but SVE can do them.
1849 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1851 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1855 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1859 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1863 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1867 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1868
1869 // Int operations with no NEON support.
1870 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1871 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1874 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1876 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1879 }
1880
1881 // Use SVE for vectors with more than 2 elements.
1882 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1883 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1884 }
1885
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1889 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1890
1891 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1892
1893 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1895 }
1896
1897 // Handle partial reduction operations
1898 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1899 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1900 // Other pairs will default to 'Expand'.
1901 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1902 ISD::PARTIAL_REDUCE_UMLA};
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1905
1906 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1907
1908 if (Subtarget->hasMatMulInt8()) {
1909 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1910 MVT::nxv16i8, Legal);
1911 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1912 MVT::nxv16i8, Custom);
1913 }
1914
1915 // Wide add types
1916 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1919 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1920 }
1921 }
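The pairs marked Legal above are exactly the shapes UDOT/SDOT compute: each accumulator lane absorbs the sum of four adjacent narrower products. A standalone scalar model of the nxv4i32 <- nxv16i8 case, shown for one 128-bit granule:

#include <array>
#include <cstdint>

// One SDOT-style step: Acc[lane] += sum of four i8*i8 products for that lane.
std::array<int32_t, 4> partialReduceMLA(std::array<int32_t, 4> Acc,
                                        const std::array<int8_t, 16> &A,
                                        const std::array<int8_t, 16> &B) {
  for (int Lane = 0; Lane < 4; ++Lane)
    for (int J = 0; J < 4; ++J)
      Acc[Lane] += int32_t(A[Lane * 4 + J]) * int32_t(B[Lane * 4 + J]);
  return Acc;
}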
1922
1923 // Handle non-aliasing elements mask
1924 if (Subtarget->hasSVE2() ||
1925 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1926 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1927 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1930 }
1931 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1934 }
1935 }
1936
1937 // Handle operations that are only available in non-streaming SVE mode.
1938 if (Subtarget->isSVEAvailable()) {
1939 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1940 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1941 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1942 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1943 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1944 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1945 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1946 setOperationAction(ISD::MGATHER, VT, Custom);
1947 setOperationAction(ISD::MSCATTER, VT, Custom);
1948 }
1949
1950 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1951 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1952 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1953 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1954
1955 // We can lower types that have <vscale x {2|4}> elements to compact.
1956 for (auto VT :
1957 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1958 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1960
1961 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1962 // NEON vectors in the lowest bits of the SVE register.
1963 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1964 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1966
1967 // Histcnt is SVE2 only
1968 if (Subtarget->hasSVE2()) {
1969 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1970 Custom);
1971 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1972 Custom);
1973
1974 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1975 ISD::PARTIAL_REDUCE_UMLA};
1976 // Must be lowered to SVE instructions.
1977 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1978 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1979 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1980 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1981 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1982 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1983 }
1984 }
1985
1986 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1987 // Only required for llvm.aarch64.mops.memset.tag
1989 }
1990
1992
1993 if (Subtarget->hasSVE()) {
1994 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1995 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1996 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1997 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1998 }
1999
2000 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2001
2002 IsStrictFPEnabled = true;
2004
2005 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2006 // it, but it's just a wrapper around ldexp.
2007 if (Subtarget->isTargetWindows()) {
2008 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2009 if (isOperationExpand(Op, MVT::f32))
2010 setOperationAction(Op, MVT::f32, Promote);
2011 }
2012
2013 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2014 // isn't legal.
2015 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2016 if (isOperationExpand(Op, MVT::f16))
2017 setOperationAction(Op, MVT::f16, Promote);
2018}
2019
2021 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2022}
2023
2024void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2025 assert(VT.isVector() && "VT should be a vector type");
2026
2027 if (VT.isFloatingPoint()) {
2029 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2030 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2031 }
2032
2033 // Mark vector float intrinsics as expand.
2034 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2035 setOperationAction(ISD::FSIN, VT, Expand);
2036 setOperationAction(ISD::FCOS, VT, Expand);
2037 setOperationAction(ISD::FTAN, VT, Expand);
2038 setOperationAction(ISD::FASIN, VT, Expand);
2039 setOperationAction(ISD::FACOS, VT, Expand);
2040 setOperationAction(ISD::FATAN, VT, Expand);
2041 setOperationAction(ISD::FATAN2, VT, Expand);
2042 setOperationAction(ISD::FSINH, VT, Expand);
2043 setOperationAction(ISD::FCOSH, VT, Expand);
2044 setOperationAction(ISD::FTANH, VT, Expand);
2045 setOperationAction(ISD::FPOW, VT, Expand);
2046 setOperationAction(ISD::FLOG, VT, Expand);
2047 setOperationAction(ISD::FLOG2, VT, Expand);
2048 setOperationAction(ISD::FLOG10, VT, Expand);
2049 setOperationAction(ISD::FEXP, VT, Expand);
2050 setOperationAction(ISD::FEXP2, VT, Expand);
2051 setOperationAction(ISD::FEXP10, VT, Expand);
2052 }
2053
2054 // But we do support custom-lowering for FCOPYSIGN.
2055 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2056 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2057 VT == MVT::v8f16) &&
2058 Subtarget->hasFullFP16()))
2060
2073
2077 for (MVT InnerVT : MVT::all_valuetypes())
2078 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2079
2080 // CNT supports only B element sizes, so use UADDLP afterwards to widen.
2081 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2083
2089
2090 for (unsigned Opcode :
2093 setOperationAction(Opcode, VT, Custom);
2094
2095 if (!VT.isFloatingPoint())
2097
2098 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2099 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2100 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2101 setOperationAction(Opcode, VT, Legal);
2102
2103 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2104 // NEON types.
2105 if (VT.isFloatingPoint() &&
2106 VT.getVectorElementType() != MVT::bf16 &&
2107 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2108 for (unsigned Opcode :
2109 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2110 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2114 setOperationAction(Opcode, VT, Legal);
2115
2116 // Strict fp extend and trunc are legal
2117 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2119 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2121
2122 // FIXME: We could potentially make use of the vector comparison instructions
2123 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2124 // complications:
2125 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2126 // so we would need to expand when the condition code doesn't match the
2127 // kind of comparison.
2128 // * Some kinds of comparison require more than one FCMXY instruction so
2129 // would need to be expanded instead.
2130 // * The lowering of the non-strict versions involves target-specific ISD
2131 // nodes so we would likely need to add strict versions of all of them and
2132 // handle them appropriately.
2135
2136 // When little-endian we can use ordinary d and q register loads/stores for
2137 // vector types, but when big-endian we need to use structure load/store which
2138 // only allow post-index addressing.
2139 if (Subtarget->isLittleEndian()) {
2140 for (unsigned im = (unsigned)ISD::PRE_INC;
2141 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2144 }
2145 } else {
2148 }
2149
2150 if (Subtarget->hasD128()) {
2153 }
2154
2155 if (VT.isInteger()) {
2156 // Let common code emit inverted variants of compares we do support.
2162 }
2163}
2164
2166 EVT OpVT) const {
2167 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2168 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2169 ResVT.getVectorElementType() != MVT::i1)
2170 return true;
2171
2172 // Only support illegal types if the result is scalable and min elements > 1.
2173 if (ResVT.getVectorMinNumElements() == 1 ||
2174 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2175 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2176 return true;
2177
2178 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2179 // but anything larger should be expanded.
2180 if (OpVT.getFixedSizeInBits() > 64)
2181 return true;
2182
2183 return false;
2184}
2185
2187 if (!Subtarget->isSVEorStreamingSVEAvailable())
2188 return true;
2189
2190 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2191 // also support fixed-width predicates.
2192 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2193 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2194 VT != MVT::v4i1 && VT != MVT::v2i1;
2195}
2196
2198 unsigned SearchSize) const {
2199 // MATCH is SVE2 and only available in non-streaming mode.
2200 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2201 return true;
2202 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2203 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2204 return SearchSize != 8;
2205 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2206 return SearchSize != 8 && SearchSize != 16;
2207 return true;
2208}
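// Illustrative sketch (editorial addition, not part of the backend): one way
// to read the rule above is that MATCH handles needles of 8-bit or 16-bit
// elements whose search vector fits in a single 128-bit segment (16 x i8 or
// 8 x i16; an 8 x i8 search also works). A minimal standalone model of that
// predicate, ignoring the exact MVTs and keyed only on the element width:
#include <cassert>
static bool matchNeedsExpansion(unsigned EltBits, unsigned SearchSize,
                                bool HasNonStreamingSVE2) {
  if (!HasNonStreamingSVE2)
    return true;                                // No MATCH instruction.
  if (EltBits == 16)
    return SearchSize != 8;                     // 8 x i16 = one segment.
  if (EltBits == 8)
    return SearchSize != 8 && SearchSize != 16; // 8 or 16 x i8 supported.
  return true;                                  // Wider elements: expand.
}
static void testMatchRule() {
  assert(!matchNeedsExpansion(8, 16, true));
  assert(!matchNeedsExpansion(16, 8, true));
  assert(matchNeedsExpansion(32, 4, true));
  assert(matchNeedsExpansion(8, 16, false));
}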
2209
2210void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2211 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2212
2213 // By default everything must be expanded.
2214 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2216
2217 if (VT.isFloatingPoint()) {
2227 }
2228
2229 TargetLoweringBase::LegalizeAction Default =
2230 VT == MVT::v1f64 ? Expand : Custom;
2231
2232 // Mark integer truncating stores/extending loads as having custom lowering
2233 if (VT.isInteger()) {
2234 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2235 while (InnerVT != VT) {
2236 setTruncStoreAction(VT, InnerVT, Default);
2237 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2238 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2239 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2240 InnerVT = InnerVT.changeVectorElementType(
2241 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2242 }
2243 }
2244
2245 // Mark floating-point truncating stores/extending loads as having custom
2246 // lowering
2247 if (VT.isFloatingPoint()) {
2248 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2249 while (InnerVT != VT) {
2250 setTruncStoreAction(VT, InnerVT, Custom);
2251 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2252 InnerVT = InnerVT.changeVectorElementType(
2254 }
2255 }
2256
2257 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2258 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2259
2260 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2261 ISD::PARTIAL_REDUCE_UMLA};
2262 unsigned NumElts = VT.getVectorNumElements();
2263 if (VT.getVectorElementType() == MVT::i64) {
2264 setPartialReduceMLAAction(MLAOps, VT,
2265 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2266 setPartialReduceMLAAction(MLAOps, VT,
2267 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2268 setPartialReduceMLAAction(MLAOps, VT,
2269 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2270 } else if (VT.getVectorElementType() == MVT::i32) {
2271 setPartialReduceMLAAction(MLAOps, VT,
2272 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2273 setPartialReduceMLAAction(MLAOps, VT,
2274 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2275 } else if (VT.getVectorElementType() == MVT::i16) {
2276 setPartialReduceMLAAction(MLAOps, VT,
2277 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2278 }
2279 if (Subtarget->hasMatMulInt8()) {
2280 if (VT.getVectorElementType() == MVT::i32)
2281 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2282 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2283 else if (VT.getVectorElementType() == MVT::i64)
2284 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2285 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2286 }
2287
2288 // Lower fixed length vector operations to scalable equivalents.
2295 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2305 setOperationAction(ISD::FABS, VT, Default);
2307 setOperationAction(ISD::FCEIL, VT, Default);
2310 setOperationAction(ISD::FFLOOR, VT, Default);
2312 setOperationAction(ISD::FMAXIMUM, VT, Default);
2313 setOperationAction(ISD::FMAXNUM, VT, Default);
2314 setOperationAction(ISD::FMINIMUM, VT, Default);
2315 setOperationAction(ISD::FMINNUM, VT, Default);
2317 setOperationAction(ISD::FNEARBYINT, VT, Default);
2318 setOperationAction(ISD::FNEG, VT, Default);
2319 setOperationAction(ISD::FP_EXTEND, VT, Default);
2323 setOperationAction(ISD::FRINT, VT, Default);
2324 setOperationAction(ISD::LRINT, VT, Default);
2325 setOperationAction(ISD::LLRINT, VT, Default);
2326 setOperationAction(ISD::FROUND, VT, Default);
2327 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2328 setOperationAction(ISD::FSQRT, VT, Default);
2330 setOperationAction(ISD::FTRUNC, VT, Default);
2331 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2333 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2334 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2335 setOperationAction(ISD::MLOAD, VT, Default);
2336 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2337 setOperationAction(ISD::MSTORE, VT, Default);
2355 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2362 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2363 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2364 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2365 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2366 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2367 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2368 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2369 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2370 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2371 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2372 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2373 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2374 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2375 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2381}
2382
2383void AArch64TargetLowering::addDRType(MVT VT) {
2384 addRegisterClass(VT, &AArch64::FPR64RegClass);
2385 if (Subtarget->isNeonAvailable())
2386 addTypeForNEON(VT);
2387}
2388
2389void AArch64TargetLowering::addQRType(MVT VT) {
2390 addRegisterClass(VT, &AArch64::FPR128RegClass);
2391 if (Subtarget->isNeonAvailable())
2392 addTypeForNEON(VT);
2393}
2394
2396 LLVMContext &C, EVT VT) const {
2397 if (!VT.isVector())
2398 return MVT::i32;
2399 if (VT.isScalableVector())
2400 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2402}
2403
2404// isIntImmediate - This method tests to see if the node is a constant
2405// operand. If so, Imm will receive the value.
2406static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2407 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
2408 Imm = C->getZExtValue();
2409 return true;
2410 }
2411 return false;
2412}
2413
2414bool isVectorizedBinOp(unsigned Opcode) {
2415 switch (Opcode) {
2416 case AArch64ISD::SQDMULH:
2417 return true;
2418 default:
2419 return false;
2420 }
2421}
2422
2423// isOpcWithIntImmediate - This method tests to see if the node is a specific
2424// opcode and that it has a immediate integer right operand.
2425// If so Imm will receive the value.
2426static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2427 uint64_t &Imm) {
2428 return N->getOpcode() == Opc &&
2429 isIntImmediate(N->getOperand(1).getNode(), Imm);
2430}
2431
2432static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2433 const APInt &Demanded,
2435 unsigned NewOpc) {
2436 uint64_t OldImm = Imm, NewImm, Enc;
2437 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2438
2439 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2440 // bimm64.
2441 if (Imm == 0 || Imm == Mask ||
2443 return false;
2444
2445 unsigned EltSize = Size;
2446 uint64_t DemandedBits = Demanded.getZExtValue();
2447
2448 // Clear bits that are not demanded.
2449 Imm &= DemandedBits;
2450
2451 while (true) {
2452 // The goal here is to set the non-demanded bits in a way that minimizes
2453 // the number of switching between 0 and 1. In order to achieve this goal,
2454 // we set the non-demanded bits to the value of the preceding demanded bits.
2455 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2456 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2457 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2458 // The final result is 0b11000011.
2459 uint64_t NonDemandedBits = ~DemandedBits;
2460 uint64_t InvertedImm = ~Imm & DemandedBits;
2461 uint64_t RotatedImm =
2462 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2463 NonDemandedBits;
2464 uint64_t Sum = RotatedImm + NonDemandedBits;
2465 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2466 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2467 NewImm = (Imm | Ones) & Mask;
2468
2469 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2470 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2471 // we halve the element size and continue the search.
2472 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2473 break;
2474
2475 // We cannot shrink the element size any further if it is 2-bits.
2476 if (EltSize == 2)
2477 return false;
2478
2479 EltSize /= 2;
2480 Mask >>= EltSize;
2481 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2482
2483 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2484 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2485 return false;
2486
2487 // Merge the upper and lower halves of Imm and DemandedBits.
2488 Imm |= Hi;
2489 DemandedBits |= DemandedBitsHi;
2490 }
2491
2492 ++NumOptimizedImms;
2493
2494 // Replicate the element across the register width.
2495 while (EltSize < Size) {
2496 NewImm |= NewImm << EltSize;
2497 EltSize *= 2;
2498 }
2499
2500 (void)OldImm;
2501 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2502 "demanded bits should never be altered");
2503 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2504
2505 // Create the new constant immediate node.
2506 EVT VT = Op.getValueType();
2507 SDLoc DL(Op);
2508 SDValue New;
2509
2510 // If the new constant immediate is all-zeros or all-ones, let the target
2511 // independent DAG combine optimize this node.
2512 if (NewImm == 0 || NewImm == OrigMask) {
2513 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2514 TLO.DAG.getConstant(NewImm, DL, VT));
2515 // Otherwise, create a machine node so that target independent DAG combine
2516 // doesn't undo this optimization.
2517 } else {
2518 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2519 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2520 New = SDValue(
2521 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2522 }
2523
2524 return TLO.CombineTo(Op, New);
2525}
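// Illustrative sketch (editorial addition, not part of the backend): the
// bit-filling step of the loop above, reduced to plain integer arithmetic
// (masked to EltSize here for simplicity) and checked against the
// 0bx10xx0x1 example from the comment.
#include <cassert>
#include <cstdint>
static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                    unsigned EltSize) {
  uint64_t Mask = EltSize == 64 ? ~0ULL : (1ULL << EltSize) - 1;
  Imm &= DemandedBits & Mask;
  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  uint64_t InvertedImm = ~Imm & DemandedBits & Mask;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | ((InvertedImm >> (EltSize - 1)) & 1)) &
      NonDemandedBits;
  uint64_t Sum = (RotatedImm + NonDemandedBits) & Mask;
  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  return (Imm | Ones) & Mask;
}
static void testFillNonDemandedBits() {
  // Demanded mask 0b01100101 holding the value 0b01000001 becomes 0b11000011.
  assert(fillNonDemandedBits(0x41, 0x65, /*EltSize=*/8) == 0xC3);
}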
2526
2528 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2529 TargetLoweringOpt &TLO) const {
2530 // Delay this optimization to as late as possible.
2531 if (!TLO.LegalOps)
2532 return false;
2533
2534 if (!EnableOptimizeLogicalImm)
2535 return false;
2536
2537 EVT VT = Op.getValueType();
2538 if (VT.isVector())
2539 return false;
2540
2541 unsigned Size = VT.getSizeInBits();
2542
2543 if (Size != 32 && Size != 64)
2544 return false;
2545
2546 // Exit early if we demand all bits.
2547 if (DemandedBits.popcount() == Size)
2548 return false;
2549
2550 unsigned NewOpc;
2551 switch (Op.getOpcode()) {
2552 default:
2553 return false;
2554 case ISD::AND:
2555 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2556 break;
2557 case ISD::OR:
2558 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2559 break;
2560 case ISD::XOR:
2561 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2562 break;
2563 }
2564 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2565 if (!C)
2566 return false;
2567 uint64_t Imm = C->getZExtValue();
2568 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2569}
2570
2571/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2572/// Mask are known to be either zero or one and return them in Known.
2574 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2575 const SelectionDAG &DAG, unsigned Depth) const {
2576 switch (Op.getOpcode()) {
2577 default:
2578 break;
2579 case AArch64ISD::DUP: {
2580 SDValue SrcOp = Op.getOperand(0);
2581 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2582 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2583 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2584 "Expected DUP implicit truncation");
2585 Known = Known.trunc(Op.getScalarValueSizeInBits());
2586 }
2587 break;
2588 }
2589 case AArch64ISD::CSEL: {
2590 KnownBits Known2;
2591 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2592 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2593 Known = Known.intersectWith(Known2);
2594 break;
2595 }
2596 case AArch64ISD::CSNEG:
2597 case AArch64ISD::CSINC:
2598 case AArch64ISD::CSINV: {
2599 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2600 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2601
2602 // The result is either:
2603 // CSINC: KnownOp0 or KnownOp1 + 1
2604 // CSINV: KnownOp0 or ~KnownOp1
2605 // CSNEG: KnownOp0 or KnownOp1 * -1
2606 if (Op.getOpcode() == AArch64ISD::CSINC)
2607 KnownOp1 = KnownBits::add(
2608 KnownOp1,
2609 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2610 else if (Op.getOpcode() == AArch64ISD::CSINV)
2611 std::swap(KnownOp1.Zero, KnownOp1.One);
2612 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2613 KnownOp1 =
2615 Op.getScalarValueSizeInBits())));
2616
2617 Known = KnownOp0.intersectWith(KnownOp1);
2618 break;
2619 }
2620 case AArch64ISD::BICi: {
2621 // Compute the bit cleared value.
2622 APInt Mask =
2623 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2624 .trunc(Known.getBitWidth());
2625 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2626 Known &= KnownBits::makeConstant(Mask);
2627 break;
2628 }
2629 case AArch64ISD::VLSHR: {
2630 KnownBits Known2;
2631 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2632 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2633 Known = KnownBits::lshr(Known, Known2);
2634 break;
2635 }
2636 case AArch64ISD::VASHR: {
2637 KnownBits Known2;
2638 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2639 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2640 Known = KnownBits::ashr(Known, Known2);
2641 break;
2642 }
2643 case AArch64ISD::VSHL: {
2644 KnownBits Known2;
2645 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2646 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2647 Known = KnownBits::shl(Known, Known2);
2648 break;
2649 }
2650 case AArch64ISD::MOVI: {
2652 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2653 break;
2654 }
2655 case AArch64ISD::MOVIshift: {
2657 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2658 << Op->getConstantOperandVal(1)));
2659 break;
2660 }
2661 case AArch64ISD::MOVImsl: {
2662 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2664 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2665 break;
2666 }
2667 case AArch64ISD::MOVIedit: {
2669 Known.getBitWidth(),
2670 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2671 break;
2672 }
2673 case AArch64ISD::MVNIshift: {
2675 APInt(Known.getBitWidth(),
2676 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2677 /*isSigned*/ false, /*implicitTrunc*/ true));
2678 break;
2679 }
2680 case AArch64ISD::MVNImsl: {
2681 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2683 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2684 /*isSigned*/ false, /*implicitTrunc*/ true));
2685 break;
2686 }
2687 case AArch64ISD::LOADgot:
2688 case AArch64ISD::ADDlow: {
2689 if (!Subtarget->isTargetILP32())
2690 break;
2691 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2692 Known.Zero = APInt::getHighBitsSet(64, 32);
2693 break;
2694 }
2695 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2696 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2697 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2698 break;
2699 }
2700 case ISD::INTRINSIC_W_CHAIN: {
2701 Intrinsic::ID IntID =
2702 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2703 switch (IntID) {
2704 default: return;
2705 case Intrinsic::aarch64_ldaxr:
2706 case Intrinsic::aarch64_ldxr: {
2707 unsigned BitWidth = Known.getBitWidth();
2708 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2709 unsigned MemBits = VT.getScalarSizeInBits();
2710 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2711 return;
2712 }
2713 }
2714 break;
2715 }
2717 case ISD::INTRINSIC_VOID: {
2718 unsigned IntNo = Op.getConstantOperandVal(0);
2719 switch (IntNo) {
2720 default:
2721 break;
2722 case Intrinsic::aarch64_neon_uaddlv: {
2723 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2724 unsigned BitWidth = Known.getBitWidth();
2725 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2726 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2727 assert(BitWidth >= Bound && "Unexpected width!");
2729 Known.Zero |= Mask;
2730 }
2731 break;
2732 }
2733 case Intrinsic::aarch64_neon_umaxv:
2734 case Intrinsic::aarch64_neon_uminv: {
2735 // Figure out the datatype of the vector operand. The UMINV instruction
2736 // will zero extend the result, so we can mark as known zero all the
2737 // bits larger than the element datatype. 32-bit or larger doesn't need
2738 // this as those are legal types and will be handled by isel directly.
2739 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2740 unsigned BitWidth = Known.getBitWidth();
2741 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2742 assert(BitWidth >= 8 && "Unexpected width!");
2744 Known.Zero |= Mask;
2745 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2746 assert(BitWidth >= 16 && "Unexpected width!");
2748 Known.Zero |= Mask;
2749 }
2750 break;
2751 } break;
2752 }
2753 }
2754 }
2755}
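// Illustrative check (editorial addition, not part of the backend): the
// "Bound" values used for aarch64_neon_uaddlv above follow from the largest
// possible sums (8 or 16 lanes of at most 255), and the umaxv/uminv cases
// simply bound the result by the source element width.
static_assert(8 * 255 < (1 << 11), "v8i8 uaddlv result fits in 11 bits");
static_assert(16 * 255 < (1 << 12), "v16i8 uaddlv result fits in 12 bits");
static_assert(255 < (1 << 8) && 65535 < (1 << 16),
              "umaxv/uminv results fit in the source element width");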
2756
2758 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2759 unsigned Depth) const {
2760 EVT VT = Op.getValueType();
2761 unsigned VTBits = VT.getScalarSizeInBits();
2762 unsigned Opcode = Op.getOpcode();
2763 switch (Opcode) {
2764 case AArch64ISD::FCMEQ:
2765 case AArch64ISD::FCMGE:
2766 case AArch64ISD::FCMGT:
2767 // Compares return either 0 or all-ones
2768 return VTBits;
2769 case AArch64ISD::VASHR: {
2770 unsigned Tmp =
2771 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2772 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2773 }
2774 }
2775
2776 return 1;
2777}
2778
2780 EVT) const {
2781 return MVT::i64;
2782}
2783
2785 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2786 unsigned *Fast) const {
2787
2788 // Allow SVE loads/stores where the alignment >= the size of the element type,
2789 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2790 // for stores that come from IR, only require element-size alignment (even if
2791 // unaligned accesses are disabled). Without this, these will be forced to
2792 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2793 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2794 if (VT.isScalableVector()) {
2795 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2796 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2797 return true;
2798 }
2799
2800 if (Subtarget->requiresStrictAlign())
2801 return false;
2802
2803 if (Fast) {
2804 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2805 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2806 // See comments in performSTORECombine() for more details about
2807 // these conditions.
2808
2809 // Code that uses clang vector extensions can mark that it
2810 // wants unaligned accesses to be treated as fast by
2811 // underspecifying alignment to be 1 or 2.
2812 Alignment <= 2 ||
2813
2814 // Disregard v2i64. Memcpy lowering produces those and splitting
2815 // them regresses performance on micro-benchmarks and olden/bh.
2816 VT == MVT::v2i64;
2817 }
2818 return true;
2819}
2820
2821// Same as above but handling LLTs instead.
2823 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2824 unsigned *Fast) const {
2825 if (Subtarget->requiresStrictAlign())
2826 return false;
2827
2828 if (Fast) {
2829 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2830 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2831 Ty.getSizeInBytes() != 16 ||
2832 // See comments in performSTORECombine() for more details about
2833 // these conditions.
2834
2835 // Code that uses clang vector extensions can mark that it
2836 // wants unaligned accesses to be treated as fast by
2837 // underspecifying alignment to be 1 or 2.
2838 Alignment <= 2 ||
2839
2840 // Disregard v2i64. Memcpy lowering produces those and splitting
2841 // them regresses performance on micro-benchmarks and olden/bh.
2842 Ty == LLT::fixed_vector(2, 64);
2843 }
2844 return true;
2845}
2846
2847FastISel *
2849 const TargetLibraryInfo *libInfo) const {
2850 return AArch64::createFastISel(funcInfo, libInfo);
2851}
2852
2855 MachineBasicBlock *MBB) const {
2856 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2857 // phi node:
2858
2859 // OrigBB:
2860 // [... previous instrs leading to comparison ...]
2861 // b.ne TrueBB
2862 // b EndBB
2863 // TrueBB:
2864 // ; Fallthrough
2865 // EndBB:
2866 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2867
2868 MachineFunction *MF = MBB->getParent();
2869 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2870 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2871 DebugLoc DL = MI.getDebugLoc();
2872 MachineFunction::iterator It = ++MBB->getIterator();
2873
2874 Register DestReg = MI.getOperand(0).getReg();
2875 Register IfTrueReg = MI.getOperand(1).getReg();
2876 Register IfFalseReg = MI.getOperand(2).getReg();
2877 unsigned CondCode = MI.getOperand(3).getImm();
2878 bool NZCVKilled = MI.getOperand(4).isKill();
2879
2880 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2881 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2882 MF->insert(It, TrueBB);
2883 MF->insert(It, EndBB);
2884
2885 // Transfer rest of current basic-block to EndBB
2886 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2887 MBB->end());
2889
2890 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2891 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2892 MBB->addSuccessor(TrueBB);
2893 MBB->addSuccessor(EndBB);
2894
2895 // TrueBB falls through to the end.
2896 TrueBB->addSuccessor(EndBB);
2897
2898 if (!NZCVKilled) {
2899 TrueBB->addLiveIn(AArch64::NZCV);
2900 EndBB->addLiveIn(AArch64::NZCV);
2901 }
2902
2903 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2904 .addReg(IfTrueReg)
2905 .addMBB(TrueBB)
2906 .addReg(IfFalseReg)
2907 .addMBB(MBB);
2908
2909 MI.eraseFromParent();
2910 return EndBB;
2911}
2912
2920
2923 MachineBasicBlock *MBB) const {
2924 MachineFunction &MF = *MBB->getParent();
2925 MachineBasicBlock::iterator MBBI = MI.getIterator();
2926 const AArch64InstrInfo &TII =
2927 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2928 Register TargetReg = MI.getOperand(0).getReg();
2930 TII.probedStackAlloc(MBBI, TargetReg, false);
2931
2932 MI.eraseFromParent();
2933 return NextInst->getParent();
2934}
2935
2938 MachineBasicBlock *MBB) const {
2939 MachineFunction *MF = MBB->getParent();
2941
2942 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2943 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2944
2945 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2946 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2947 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2948 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2949
2950 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2951 DebugLoc DL = MI.getDebugLoc();
2952
2953 // RDVL requires GPR64, ADDSVL requires GPR64sp
2954 // We need to insert COPY instructions; these will later be removed by the
2955 // RegisterCoalescer
2956 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2957 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2958 .addReg(RegVL_GPR);
2959
2960 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2961 .addReg(RegVL_GPRsp)
2962 .addImm(-1);
2963 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2964 .addReg(RegSVL_GPRsp);
2965
2966 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2967 MachineFunction::iterator It = ++MBB->getIterator();
2968 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2969 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2970 MF->insert(It, TrapBB);
2971 MF->insert(It, PassBB);
2972
2973 // Continue if vector lengths match
2974 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2975 .addReg(RegSVL_GPR)
2976 .addMBB(PassBB);
2977
2978 // Transfer rest of current BB to PassBB
2979 PassBB->splice(PassBB->begin(), MBB,
2980 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2982
2983 // Trap if vector lengths mismatch
2984 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2985
2986 MBB->addSuccessor(TrapBB);
2987 MBB->addSuccessor(PassBB);
2988
2989 MI.eraseFromParent();
2990 return PassBB;
2991}
2992
2994AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2996 MachineBasicBlock *BB) const {
2997 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2998 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2999
3000 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3001 MIB.add(MI.getOperand(1)); // slice index register
3002 MIB.add(MI.getOperand(2)); // slice index offset
3003 MIB.add(MI.getOperand(3)); // pg
3004 MIB.add(MI.getOperand(4)); // base
3005 MIB.add(MI.getOperand(5)); // offset
3006
3007 MI.eraseFromParent(); // The pseudo is gone now.
3008 return BB;
3009}
3010
3013 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3015 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3016
3017 MIB.addReg(AArch64::ZA, RegState::Define);
3018 MIB.add(MI.getOperand(0)); // Vector select register
3019 MIB.add(MI.getOperand(1)); // Vector select offset
3020 MIB.add(MI.getOperand(2)); // Base
3021 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3022
3023 MI.eraseFromParent(); // The pseudo is gone now.
3024 return BB;
3025}
3026
3029 unsigned Opcode,
3030 bool Op0IsDef) const {
3031 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3033
3034 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3035 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3036 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3037 MIB.add(MI.getOperand(I));
3038
3039 MI.eraseFromParent(); // The pseudo is gone now.
3040 return BB;
3041}
3042
3044AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3046 MachineBasicBlock *BB) const {
3047 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3048 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3049 unsigned StartIdx = 0;
3050
3051 bool HasTile = BaseReg != AArch64::ZA;
3052 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3053 if (HasZPROut) {
3054 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3055 ++StartIdx;
3056 }
3057 if (HasTile) {
3058 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3059 RegState::Define); // Output ZA Tile
3060 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3061 StartIdx++;
3062 } else {
3063 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm].
3064 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3065 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3066 ++StartIdx;
3067 }
3068 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3069 }
3070 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3071 MIB.add(MI.getOperand(I));
3072
3073 MI.eraseFromParent(); // The pseudo is gone now.
3074 return BB;
3075}
3076
3079 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3081 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3082 MIB.add(MI.getOperand(0)); // Mask
3083
3084 unsigned Mask = MI.getOperand(0).getImm();
3085 for (unsigned I = 0; I < 8; I++) {
3086 if (Mask & (1 << I))
3087 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3088 }
3089
3090 MI.eraseFromParent(); // The pseudo is gone now.
3091 return BB;
3092}
3093
3096 MachineBasicBlock *BB) const {
3097 MachineFunction *MF = BB->getParent();
3098 MachineFrameInfo &MFI = MF->getFrameInfo();
3100 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3101 if (TPIDR2.Uses > 0) {
3102 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3103 // generally don't support big-endian SVE/SME.
3104 if (!Subtarget->isLittleEndian())
3106 "TPIDR2 block initialization is not supported on big-endian targets");
3107
3108 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3109 // Store buffer pointer and num_za_save_slices.
3110 // Bytes 10-15 are implicitly zeroed.
3111 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3112 .addReg(MI.getOperand(0).getReg())
3113 .addReg(MI.getOperand(1).getReg())
3114 .addFrameIndex(TPIDR2.FrameIndex)
3115 .addImm(0);
3116 } else
3117 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3118
3119 BB->remove_instr(&MI);
3120 return BB;
3121}
3122
3125 MachineBasicBlock *BB) const {
3126 MachineFunction *MF = BB->getParent();
3127 MachineFrameInfo &MFI = MF->getFrameInfo();
3129 // TODO This function grows the stack with a subtraction, which doesn't work
3130 // on Windows. Some refactoring to share the functionality in
3131 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3132 // supports SME
3134 "Lazy ZA save is not yet supported on Windows");
3135
3136 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3137
3138 if (TPIDR2.Uses > 0) {
3139 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3141
3142 // The SUBXrs below won't always be emitted in a form that accepts SP
3143 // directly
3144 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3145 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3146 .addReg(AArch64::SP);
3147
3148 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3149 auto Size = MI.getOperand(1).getReg();
3150 auto Dest = MI.getOperand(0).getReg();
3151 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3152 .addReg(Size)
3153 .addReg(Size)
3154 .addReg(SP);
3155 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3156 AArch64::SP)
3157 .addReg(Dest);
3158
3159 // We have just allocated a variable sized object, tell this to PEI.
3160 MFI.CreateVariableSizedObject(Align(16), nullptr);
3161 }
3162
3163 BB->remove_instr(&MI);
3164 return BB;
3165}
3166
3167// TODO: Find a way to merge this with EmitAllocateZABuffer.
3170 MachineBasicBlock *BB) const {
3171 MachineFunction *MF = BB->getParent();
3172 MachineFrameInfo &MFI = MF->getFrameInfo();
3175 "Lazy ZA save is not yet supported on Windows");
3176
3177 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3178 if (FuncInfo->isSMESaveBufferUsed()) {
3179 // Allocate a buffer object of the size given by MI.getOperand(1).
3180 auto Size = MI.getOperand(1).getReg();
3181 auto Dest = MI.getOperand(0).getReg();
3182 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3183 .addReg(AArch64::SP)
3184 .addReg(Size)
3186 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3187 .addReg(AArch64::SP);
3188
3189 // We have just allocated a variable sized object, tell this to PEI.
3190 MFI.CreateVariableSizedObject(Align(16), nullptr);
3191 } else
3192 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3193 MI.getOperand(0).getReg());
3194
3195 BB->remove_instr(&MI);
3196 return BB;
3197}
3198
3201 MachineBasicBlock *BB) const {
3202 // If the buffer is used, emit a call to __arm_sme_state_size()
3203 MachineFunction *MF = BB->getParent();
3205 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3206 if (FuncInfo->isSMESaveBufferUsed()) {
3207 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3208 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3209 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3211 .addReg(AArch64::X0, RegState::ImplicitDefine)
3212 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3213 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3214 MI.getOperand(0).getReg())
3215 .addReg(AArch64::X0);
3216 } else
3217 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3218 MI.getOperand(0).getReg())
3219 .addReg(AArch64::XZR);
3220 BB->remove_instr(&MI);
3221 return BB;
3222}
3223
3226 MachineBasicBlock *BB) const {
3227 MachineFunction *MF = BB->getParent();
3228 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3229 const DebugLoc &DL = MI.getDebugLoc();
3230 Register ResultReg = MI.getOperand(0).getReg();
3231 if (MF->getRegInfo().use_empty(ResultReg)) {
3232 // Nothing to do. Pseudo erased below.
3233 } else if (Subtarget->hasSME()) {
3234 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3235 .addImm(AArch64SysReg::SVCR)
3236 .addReg(AArch64::VG, RegState::Implicit);
3237 } else {
3238 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3239 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3240 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3242 .addReg(AArch64::X0, RegState::ImplicitDefine)
3243 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3244 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3245 .addReg(AArch64::X0);
3246 }
3247 MI.eraseFromParent();
3248 return BB;
3249}
3250
3251// Helper function to find the instruction that defined a virtual register.
3252// If unable to find such an instruction, returns nullptr.
3254 Register Reg) {
3255 while (Reg.isVirtual()) {
3256 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3257 assert(DefMI && "Virtual register definition not found");
3258 unsigned Opcode = DefMI->getOpcode();
3259
3260 if (Opcode == AArch64::COPY) {
3261 Reg = DefMI->getOperand(1).getReg();
3262 // Vreg is defined by copying from physreg.
3263 if (Reg.isPhysical())
3264 return DefMI;
3265 continue;
3266 }
3267 if (Opcode == AArch64::SUBREG_TO_REG) {
3268 Reg = DefMI->getOperand(2).getReg();
3269 continue;
3270 }
3271
3272 return DefMI;
3273 }
3274 return nullptr;
3275}
3276
3279 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3280 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3281 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3282 const DebugLoc &DL = MI.getDebugLoc();
3283
3284 Register AddrDisc = AddrDiscOp.getReg();
3285 int64_t IntDisc = IntDiscOp.getImm();
3286 assert(IntDisc == 0 && "Blend components are already expanded");
3287
3288 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3289 if (DiscMI) {
3290 switch (DiscMI->getOpcode()) {
3291 case AArch64::MOVKXi:
3292 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3293 // #imm should be an immediate and not a global symbol, for example.
3294 if (DiscMI->getOperand(2).isImm() &&
3295 DiscMI->getOperand(3).getImm() == 48) {
3296 AddrDisc = DiscMI->getOperand(1).getReg();
3297 IntDisc = DiscMI->getOperand(2).getImm();
3298 }
3299 break;
3300 case AArch64::MOVi32imm:
3301 case AArch64::MOVi64imm:
3302 // Small immediate integer constant passed via VReg.
3303 if (DiscMI->getOperand(1).isImm() &&
3304 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3305 AddrDisc = AArch64::NoRegister;
3306 IntDisc = DiscMI->getOperand(1).getImm();
3307 }
3308 break;
3309 }
3310 }
3311
3312 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3313 // in the requested register class.
3314 if (AddrDisc == AArch64::XZR)
3315 AddrDisc = AArch64::NoRegister;
3316
3317 // Make sure AddrDisc operand respects the register class imposed by MI.
3318 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3319 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3320 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3321 AddrDisc = TmpReg;
3322 }
3323
3324 AddrDiscOp.setReg(AddrDisc);
3325 IntDiscOp.setImm(IntDisc);
3326}
3327
3329 MachineInstr &MI, MachineBasicBlock *BB) const {
3330
3331 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3332 if (SMEOrigInstr != -1) {
3333 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3334 uint64_t SMEMatrixType =
3335 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3336 switch (SMEMatrixType) {
3338 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3340 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3342 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3344 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3346 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3348 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3349 }
3350 }
3351
3352 switch (MI.getOpcode()) {
3353 default:
3354#ifndef NDEBUG
3355 MI.dump();
3356#endif
3357 llvm_unreachable("Unexpected instruction for custom inserter!");
3358 case AArch64::InitTPIDR2Obj:
3359 return EmitInitTPIDR2Object(MI, BB);
3360 case AArch64::AllocateZABuffer:
3361 return EmitAllocateZABuffer(MI, BB);
3362 case AArch64::AllocateSMESaveBuffer:
3363 return EmitAllocateSMESaveBuffer(MI, BB);
3364 case AArch64::GetSMESaveSize:
3365 return EmitGetSMESaveSize(MI, BB);
3366 case AArch64::EntryPStateSM:
3367 return EmitEntryPStateSM(MI, BB);
3368 case AArch64::F128CSEL:
3369 return EmitF128CSEL(MI, BB);
3370 case TargetOpcode::STATEPOINT:
3371 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3372 // while bl call instruction (where statepoint will be lowered at the end)
3373 // has implicit def. This def is early-clobber as it will be set at
3374 // the moment of the call and earlier than any use is read.
3375 // Add this implicit dead def here as a workaround.
3376 MI.addOperand(*MI.getMF(),
3378 AArch64::LR, /*isDef*/ true,
3379 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3380 /*isUndef*/ false, /*isEarlyClobber*/ true));
3381 [[fallthrough]];
3382 case TargetOpcode::STACKMAP:
3383 case TargetOpcode::PATCHPOINT:
3384 return emitPatchPoint(MI, BB);
3385
3386 case TargetOpcode::PATCHABLE_EVENT_CALL:
3387 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3388 return BB;
3389
3390 case AArch64::CATCHRET:
3391 return EmitLoweredCatchRet(MI, BB);
3392
3393 case AArch64::PROBED_STACKALLOC_DYN:
3394 return EmitDynamicProbedAlloc(MI, BB);
3395
3396 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3397 return EmitCheckMatchingVL(MI, BB);
3398
3399 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3400 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3401 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3402 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3403 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3404 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3405 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3406 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3407 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3408 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3409 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3410 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3411 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3412 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3413 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3414 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3415 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3416 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3417 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3418 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3419 case AArch64::LDR_ZA_PSEUDO:
3420 return EmitFill(MI, BB);
3421 case AArch64::LDR_TX_PSEUDO:
3422 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3423 case AArch64::STR_TX_PSEUDO:
3424 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3425 case AArch64::ZERO_M_PSEUDO:
3426 return EmitZero(MI, BB);
3427 case AArch64::ZERO_T_PSEUDO:
3428 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3429 case AArch64::MOVT_TIZ_PSEUDO:
3430 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3431
3432 case AArch64::PAC:
3433 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3434 &AArch64::GPR64noipRegClass);
3435 return BB;
3436 }
3437}
3438
3439//===----------------------------------------------------------------------===//
3440// AArch64 Lowering private implementation.
3441//===----------------------------------------------------------------------===//
3442
3443//===----------------------------------------------------------------------===//
3444// Lowering Code
3445//===----------------------------------------------------------------------===//
3446
3447// Forward declarations of SVE fixed length lowering helpers
3452 SelectionDAG &DAG);
3455 EVT VT);
3456
3457/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3458static bool isZerosVector(const SDNode *N) {
3459 // Look through a bit convert.
3460 while (N->getOpcode() == ISD::BITCAST)
3461 N = N->getOperand(0).getNode();
3462
3464 return true;
3465
3466 if (N->getOpcode() != AArch64ISD::DUP)
3467 return false;
3468
3469 auto Opnd0 = N->getOperand(0);
3470 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3471}
3472
3473/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3474/// CC
3476 SDValue RHS = {}) {
3477 switch (CC) {
3478 default:
3479 llvm_unreachable("Unknown condition code!");
3480 case ISD::SETNE:
3481 return AArch64CC::NE;
3482 case ISD::SETEQ:
3483 return AArch64CC::EQ;
3484 case ISD::SETGT:
3485 return AArch64CC::GT;
3486 case ISD::SETGE:
3488 case ISD::SETLT:
3490 case ISD::SETLE:
3491 return AArch64CC::LE;
3492 case ISD::SETUGT:
3493 return AArch64CC::HI;
3494 case ISD::SETUGE:
3495 return AArch64CC::HS;
3496 case ISD::SETULT:
3497 return AArch64CC::LO;
3498 case ISD::SETULE:
3499 return AArch64CC::LS;
3500 }
3501}
3502
3503/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3505 AArch64CC::CondCode &CondCode,
3506 AArch64CC::CondCode &CondCode2) {
3507 CondCode2 = AArch64CC::AL;
3508 switch (CC) {
3509 default:
3510 llvm_unreachable("Unknown FP condition!");
3511 case ISD::SETEQ:
3512 case ISD::SETOEQ:
3513 CondCode = AArch64CC::EQ;
3514 break;
3515 case ISD::SETGT:
3516 case ISD::SETOGT:
3517 CondCode = AArch64CC::GT;
3518 break;
3519 case ISD::SETGE:
3520 case ISD::SETOGE:
3521 CondCode = AArch64CC::GE;
3522 break;
3523 case ISD::SETOLT:
3524 CondCode = AArch64CC::MI;
3525 break;
3526 case ISD::SETOLE:
3527 CondCode = AArch64CC::LS;
3528 break;
3529 case ISD::SETONE:
3530 CondCode = AArch64CC::MI;
3531 CondCode2 = AArch64CC::GT;
3532 break;
3533 case ISD::SETO:
3534 CondCode = AArch64CC::VC;
3535 break;
3536 case ISD::SETUO:
3537 CondCode = AArch64CC::VS;
3538 break;
3539 case ISD::SETUEQ:
3540 CondCode = AArch64CC::EQ;
3541 CondCode2 = AArch64CC::VS;
3542 break;
3543 case ISD::SETUGT:
3544 CondCode = AArch64CC::HI;
3545 break;
3546 case ISD::SETUGE:
3547 CondCode = AArch64CC::PL;
3548 break;
3549 case ISD::SETLT:
3550 case ISD::SETULT:
3551 CondCode = AArch64CC::LT;
3552 break;
3553 case ISD::SETLE:
3554 case ISD::SETULE:
3555 CondCode = AArch64CC::LE;
3556 break;
3557 case ISD::SETNE:
3558 case ISD::SETUNE:
3559 CondCode = AArch64CC::NE;
3560 break;
3561 }
3562}
3563
3564/// Convert a DAG fp condition code to an AArch64 CC.
3565/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3566/// should be AND'ed instead of OR'ed.
3568 AArch64CC::CondCode &CondCode,
3569 AArch64CC::CondCode &CondCode2) {
3570 CondCode2 = AArch64CC::AL;
3571 switch (CC) {
3572 default:
3573 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3574 assert(CondCode2 == AArch64CC::AL);
3575 break;
3576 case ISD::SETONE:
3577 // (a one b)
3578 // == ((a olt b) || (a ogt b))
3579 // == ((a ord b) && (a une b))
3580 CondCode = AArch64CC::VC;
3581 CondCode2 = AArch64CC::NE;
3582 break;
3583 case ISD::SETUEQ:
3584 // (a ueq b)
3585 // == ((a uno b) || (a oeq b))
3586 // == ((a ule b) && (a uge b))
3587 CondCode = AArch64CC::PL;
3588 CondCode2 = AArch64CC::LE;
3589 break;
3590 }
3591}
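// Illustrative sketch (editorial addition, not part of the backend): the two
// identities quoted in the comments above can be sanity-checked with scalar
// IEEE predicates, where "ordered" means neither operand is NaN.
#include <cassert>
#include <cmath>
static void checkAndDecompositions(double A, double B) {
  bool Uno = std::isunordered(A, B);
  // (a one b) == ((a ord b) && (a une b))
  assert((!Uno && A != B) == (!Uno && (Uno || A != B)));
  // (a ueq b) == ((a ule b) && (a uge b))
  assert((Uno || A == B) == ((Uno || A <= B) && (Uno || A >= B)));
}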
3592
3593/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3594/// CC usable with the vector instructions. Fewer operations are available
3595/// without a real NZCV register, so we have to use less efficient combinations
3596/// to get the same effect.
3598 AArch64CC::CondCode &CondCode,
3599 AArch64CC::CondCode &CondCode2,
3600 bool &Invert) {
3601 Invert = false;
3602 switch (CC) {
3603 default:
3604 // Mostly the scalar mappings work fine.
3605 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3606 break;
3607 case ISD::SETUO:
3608 Invert = true;
3609 [[fallthrough]];
3610 case ISD::SETO:
3611 CondCode = AArch64CC::MI;
3612 CondCode2 = AArch64CC::GE;
3613 break;
3614 case ISD::SETUEQ:
3615 case ISD::SETULT:
3616 case ISD::SETULE:
3617 case ISD::SETUGT:
3618 case ISD::SETUGE:
3619 // All of the compare-mask comparisons are ordered, but we can switch
3620 // between the two by a double inversion. E.g. ULE == !OGT.
3621 Invert = true;
3622 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3623 CondCode, CondCode2);
3624 break;
3625 }
3626}
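// Illustrative sketch (editorial addition, not part of the backend): the
// double inversion used above (e.g. ULE == !OGT) checked with scalar IEEE
// semantics, NaNs included.
#include <cassert>
#include <cmath>
static void checkDoubleInversion(double A, double B) {
  bool Uno = std::isunordered(A, B);
  assert((Uno || A <= B) == !(!Uno && A > B)); // ULE == !OGT
  assert((Uno || A < B) == !(!Uno && A >= B)); // ULT == !OGE
}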
3627
3628/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3630 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3631 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3632}
3633
3635 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3636 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3637 LLVM_DEBUG(dbgs() << "Is imm " << C
3638 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3639 return IsLegal;
3640}
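// Illustrative sketch (editorial addition, not part of the backend): the
// check above accepts any 12-bit immediate, optionally shifted left by 12
// bits, mirroring the ADD/SUB (immediate) encoding.
#include <cassert>
#include <cstdint>
static bool isAddSubImmediate(uint64_t C) {
  return (C >> 12) == 0 || ((C & 0xFFFULL) == 0 && (C >> 24) == 0);
}
static void testAddSubImmediate() {
  assert(isAddSubImmediate(4095));     // Fits in 12 bits.
  assert(isAddSubImmediate(0x123000)); // 12-bit value shifted by 12.
  assert(!isAddSubImmediate(4097));    // Needs bits in both halves.
}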
3641
3643 // Works for negative immediates too, as it can be written as an ADDS
3644 // instruction with a negated immediate.
3645 return isLegalArithImmed(C.abs().getZExtValue());
3646}
3647
3649 uint64_t Imm = C.getZExtValue();
3651 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3652 return Insn.size();
3653}
3654
3656 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3657 if (Op->getFlags().hasNoSignedWrap())
3658 return true;
3659
3660 // We can still figure out if the second operand is safe to use
3661 // in a CMN instruction by checking if it is known to be not the minimum
3662 // signed value. If it is not, then we can safely use CMN.
3663 // Note: We can eventually remove this check and simply rely on
3664 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3665 // consistently sets them appropriately when making said nodes.
3666
3667 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3668 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3669}
3670
3671// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3672// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3673// can be set differently by this operation. It comes down to whether
3674// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3675// everything is fine. If not then the optimization is wrong. Thus general
3676// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3677//
3678// So, finally, the only LLVM-native comparisons that don't mention C or V
3679// are the ones that aren't unsigned comparisons. They're the only ones we can
3680// safely use CMN for in the absence of information about op2.
3682 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3683 (isIntEqualitySetCC(CC) ||
3684 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3685 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3686}
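// Illustrative note (editorial addition, not part of the backend): the
// INT_MIN restriction above is the usual two's complement corner case, where
// negating the minimum value wraps back to itself, so "op1 - (-op2)" and
// "op1 + op2" need not set C/V identically.
#include <cassert>
#include <cstdint>
static void checkIntMinNegationWraps() {
  uint32_t IntMin = 0x80000000u;  // Bit pattern of INT32_MIN.
  assert(0u - IntMin == IntMin);  // Negation wraps back to INT_MIN.
  assert(0u - 1u == 0xFFFFFFFFu); // Other values negate as expected.
}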
3687
3689 SelectionDAG &DAG, SDValue Chain,
3690 bool IsSignaling) {
3691 EVT VT = LHS.getValueType();
3692 assert(VT != MVT::f128);
3693
3694 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3695
3696 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3697 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3698 {Chain, LHS});
3699 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3700 {LHS.getValue(1), RHS});
3701 Chain = RHS.getValue(1);
3702 }
3703 unsigned Opcode =
3704 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3705 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3706}
3707
3709 const SDLoc &DL, SelectionDAG &DAG) {
3710 EVT VT = LHS.getValueType();
3711 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3712
3713 if (VT.isFloatingPoint()) {
3714 assert(VT != MVT::f128);
3715 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3716 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3717 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3718 }
3719 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3720 }
3721
3722 // The CMP instruction is just an alias for SUBS, and representing it as
3723 // SUBS means that it's possible to get CSE with subtract operations.
3724 // A later phase can perform the optimization of setting the destination
3725 // register to WZR/XZR if it ends up being unused.
3726 unsigned Opcode = AArch64ISD::SUBS;
3727
3728 if (isCMN(RHS, CC, DAG)) {
3729 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3730 Opcode = AArch64ISD::ADDS;
3731 RHS = RHS.getOperand(1);
3732 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3733 isIntEqualitySetCC(CC)) {
3734 // As we are looking for EQ/NE compares, the operands can be commuted; can
3735 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3736 Opcode = AArch64ISD::ADDS;
3737 LHS = LHS.getOperand(1);
3738 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3739 if (LHS.getOpcode() == ISD::AND) {
3740 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3741 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3742 // of the signed comparisons.
3743 const SDValue ANDSNode =
3744 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3745 LHS.getOperand(0), LHS.getOperand(1));
3746 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3747 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3748 return ANDSNode.getValue(1);
3749 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3750 // Use result of ANDS
3751 return LHS.getValue(1);
3752 }
3753 }
3754
3755 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3756 .getValue(1);
3757}
3758
3759/// \defgroup AArch64CCMP CMP;CCMP matching
3760///
3761/// These functions deal with the formation of CMP;CCMP;... sequences.
3762/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3763/// a comparison. They set the NZCV flags to a predefined value if their
3764/// predicate is false. This allows expressing arbitrary conjunctions, for
3765/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3766/// expressed as:
3767/// cmp A
3768/// ccmp B, inv(CB), CA
3769/// check for CB flags
3770///
3771/// This naturally lets us implement chains of AND operations with SETCC
3772/// operands. And we can even implement some other situations by transforming
3773/// them:
3774/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3775/// negating the flags used in a CCMP/FCCMP operation.
3776/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3777/// by negating the flags we test for afterwards. i.e.
3778/// NEG (CMP CCMP CCMP ...) can be implemented.
3779/// - Note that we can only ever negate all previously processed results.
3780/// What we cannot implement by flipping the flags to test is a negation
3781/// of two sub-trees (because the negation affects all sub-trees emitted so
3782/// far, so the 2nd sub-tree we emit would also affect the first).
3783/// With those tools we can implement some OR operations:
3784/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3785/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3786/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3787/// elimination rules from earlier to implement the whole thing as a
3788/// CCMP/FCCMP chain.
3789///
3790/// As complete example:
3791/// or (or (setCA (cmp A)) (setCB (cmp B)))
3792/// (and (setCC (cmp C)) (setCD (cmp D)))"
3793/// can be reassociated to:
3794/// or (and (setCC (cmp C)) (setCD (cmp D)))
3795/// (or (setCA (cmp A)) (setCB (cmp B)))
3796/// can be transformed to:
3797/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3798/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3799/// which can be implemented as:
3800/// cmp C
3801/// ccmp D, inv(CD), CC
3802/// ccmp A, CA, inv(CD)
3803/// ccmp B, CB, inv(CA)
3804/// check for CB flags
3805///
3806/// A counterexample is "or (and A B) (and C D)" which translates to
3807/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3808/// can only implement one of the inner (not) operations, but not both!
3809/// @{
3810
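A tiny standalone check (plain C++, no LLVM types) of the rewrite the block above relies on: an OR of tests is turned into a negated AND of negated tests, which is what lets a disjunction be emitted as a CCMP chain.

#include <cstdio>

int main() {
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b) {
      bool lhs = a || b;
      bool rhs = !(!a && !b);        // De Morgan form used for the CCMP chain
      std::printf("a=%d b=%d  or=%d  neg-and-neg=%d\n", a, b, lhs, rhs);
    }
  return 0;
}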
3811/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3812static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3813 ISD::CondCode CC, SDValue CCOp,
3814 AArch64CC::CondCode Predicate,
3815 AArch64CC::CondCode OutCC,
3816 const SDLoc &DL, SelectionDAG &DAG) {
3817 unsigned Opcode = 0;
3818 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3819
3820 if (LHS.getValueType().isFloatingPoint()) {
3821 assert(LHS.getValueType() != MVT::f128);
3822 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3823 LHS.getValueType() == MVT::bf16) {
3824 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3825 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3826 }
3827 Opcode = AArch64ISD::FCCMP;
3828 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3829 APInt Imm = Const->getAPIntValue();
3830 if (Imm.isNegative() && Imm.sgt(-32)) {
3831 Opcode = AArch64ISD::CCMN;
3832 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3833 }
3834 } else if (isCMN(RHS, CC, DAG)) {
3835 Opcode = AArch64ISD::CCMN;
3836 RHS = RHS.getOperand(1);
3837 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3838 isIntEqualitySetCC(CC)) {
3839 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3840 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3841 Opcode = AArch64ISD::CCMN;
3842 LHS = LHS.getOperand(1);
3843 }
3844 if (Opcode == 0)
3845 Opcode = AArch64ISD::CCMP;
3846
3847 SDValue Condition = getCondCode(DAG, Predicate);
3848 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3849 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3850 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3851 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3852}
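A standalone sketch (plain C++, not LLVM code) of the immediate trick used above: a conditional compare against a small negative constant -k can be emitted as CCMN with +k, since for k in [1, 31] negation never wraps and x - (-k) is exactly x + k.

#include <cstdint>
#include <cstdio>

int main() {
  int32_t x = 7;
  int32_t k = 5;                     // stands in for an original RHS of -5
  int32_t viaSub = x - (-k);         // what a compare against -5 computes
  int32_t viaAdd = x + k;            // what CCMN with +5 computes
  std::printf("x - (-k) = %d, x + k = %d\n", viaSub, viaAdd);
  return 0;
}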
3853
3854/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3855/// expressed as a conjunction. See \ref AArch64CCMP.
3856/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3857/// changing the conditions on the SETCC tests.
3858/// (this means we can call emitConjunctionRec() with
3859/// Negate==true on this sub-tree)
3860/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3861/// cannot do the negation naturally. We are required to
3862/// emit the subtree first in this case.
3863/// \param WillNegate Is true if we are called when the result of this
3864/// subexpression must be negated. This happens when the
3865/// outer expression is an OR. We can use this fact to know
3866/// that we have a double negation (or (or ...) ...) that
3867/// can be implemented for free.
3868static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3869 bool &MustBeFirst, bool WillNegate,
3870 unsigned Depth = 0) {
3871 if (!Val.hasOneUse())
3872 return false;
3873 unsigned Opcode = Val->getOpcode();
3874 if (Opcode == ISD::SETCC) {
3875 if (Val->getOperand(0).getValueType() == MVT::f128)
3876 return false;
3877 CanNegate = true;
3878 MustBeFirst = false;
3879 return true;
3880 }
3881 // Protect against exponential runtime and stack overflow.
3882 if (Depth > 6)
3883 return false;
3884 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3885 bool IsOR = Opcode == ISD::OR;
3886 SDValue O0 = Val->getOperand(0);
3887 SDValue O1 = Val->getOperand(1);
3888 bool CanNegateL;
3889 bool MustBeFirstL;
3890 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3891 return false;
3892 bool CanNegateR;
3893 bool MustBeFirstR;
3894 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3895 return false;
3896
3897 if (MustBeFirstL && MustBeFirstR)
3898 return false;
3899
3900 if (IsOR) {
3901 // For an OR expression we need to be able to naturally negate at least
3902 // one side or we cannot do the transformation at all.
3903 if (!CanNegateL && !CanNegateR)
3904 return false;
3905 // If the result of the OR will be negated and we can naturally negate
3906 // the leaves, then this sub-tree as a whole negates naturally.
3907 CanNegate = WillNegate && CanNegateL && CanNegateR;
3908 // If we cannot naturally negate the whole sub-tree, then this must be
3909 // emitted first.
3910 MustBeFirst = !CanNegate;
3911 } else {
3912 assert(Opcode == ISD::AND && "Must be OR or AND");
3913 // We cannot naturally negate an AND operation.
3914 CanNegate = false;
3915 MustBeFirst = MustBeFirstL || MustBeFirstR;
3916 }
3917 return true;
3918 }
3919 return false;
3920}
3921
3922/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a chain
3923/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3924/// Tries to transform the given i1 producing node @p Val to a series of compare
3925/// and conditional compare operations. @returns an NZCV flags producing node
3926/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3927/// the transformation was not possible.
3928/// \p Negate is true if we want this sub-tree to be negated just by changing
3929/// SETCC conditions.
3930static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3931 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3932 AArch64CC::CondCode Predicate) {
3933 // We're at a tree leaf, produce a conditional comparison operation.
3934 unsigned Opcode = Val->getOpcode();
3935 if (Opcode == ISD::SETCC) {
3936 SDValue LHS = Val->getOperand(0);
3937 SDValue RHS = Val->getOperand(1);
3938 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3939 bool isInteger = LHS.getValueType().isInteger();
3940 if (Negate)
3941 CC = getSetCCInverse(CC, LHS.getValueType());
3942 SDLoc DL(Val);
3943 // Determine OutCC and handle FP special case.
3944 if (isInteger) {
3945 OutCC = changeIntCCToAArch64CC(CC, RHS);
3946 } else {
3947 assert(LHS.getValueType().isFloatingPoint());
3948 AArch64CC::CondCode ExtraCC;
3949 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3950 // Some floating point conditions can't be tested with a single condition
3951 // code. Construct an additional comparison in this case.
3952 if (ExtraCC != AArch64CC::AL) {
3953 SDValue ExtraCmp;
3954 if (!CCOp.getNode())
3955 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3956 else
3957 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3958 ExtraCC, DL, DAG);
3959 CCOp = ExtraCmp;
3960 Predicate = ExtraCC;
3961 }
3962 }
3963
3964 // Produce a normal comparison if we are first in the chain
3965 if (!CCOp)
3966 return emitComparison(LHS, RHS, CC, DL, DAG);
3967 // Otherwise produce a ccmp.
3968 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3969 DAG);
3970 }
3971 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3972
3973 bool IsOR = Opcode == ISD::OR;
3974
3975 SDValue LHS = Val->getOperand(0);
3976 bool CanNegateL;
3977 bool MustBeFirstL;
3978 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3979 assert(ValidL && "Valid conjunction/disjunction tree");
3980 (void)ValidL;
3981
3982 SDValue RHS = Val->getOperand(1);
3983 bool CanNegateR;
3984 bool MustBeFirstR;
3985 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3986 assert(ValidR && "Valid conjunction/disjunction tree");
3987 (void)ValidR;
3988
3989 // Swap sub-tree that must come first to the right side.
3990 if (MustBeFirstL) {
3991 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3992 std::swap(LHS, RHS);
3993 std::swap(CanNegateL, CanNegateR);
3994 std::swap(MustBeFirstL, MustBeFirstR);
3995 }
3996
3997 bool NegateR;
3998 bool NegateAfterR;
3999 bool NegateL;
4000 bool NegateAfterAll;
4001 if (Opcode == ISD::OR) {
4002 // Swap the sub-tree that we can negate naturally to the left.
4003 if (!CanNegateL) {
4004 assert(CanNegateR && "at least one side must be negatable");
4005 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4006 assert(!Negate);
4007 std::swap(LHS, RHS);
4008 NegateR = false;
4009 NegateAfterR = true;
4010 } else {
4011 // Negate the left sub-tree if possible, otherwise negate the result.
4012 NegateR = CanNegateR;
4013 NegateAfterR = !CanNegateR;
4014 }
4015 NegateL = true;
4016 NegateAfterAll = !Negate;
4017 } else {
4018 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4019 assert(!Negate && "Valid conjunction/disjunction tree");
4020
4021 NegateL = false;
4022 NegateR = false;
4023 NegateAfterR = false;
4024 NegateAfterAll = false;
4025 }
4026
4027 // Emit sub-trees.
4028 AArch64CC::CondCode RHSCC;
4029 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4030 if (NegateAfterR)
4031 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4032 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4033 if (NegateAfterAll)
4034 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4035 return CmpL;
4036}
4037
4038/// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
4039/// In some cases this is even possible with OR operations in the expression.
4040/// See \ref AArch64CCMP.
4041/// \see emitConjunctionRec().
4042static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4043 AArch64CC::CondCode &OutCC) {
4044 bool DummyCanNegate;
4045 bool DummyMustBeFirst;
4046 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4047 return SDValue();
4048
4049 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4050}
4051
4052/// @}
4053
4054/// Returns how profitable it is to fold a comparison's operand's shift and/or
4055/// extension operations.
4056static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4057 auto isSupportedExtend = [&](SDValue V) {
4058 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4059 return true;
4060
4061 if (V.getOpcode() == ISD::AND)
4062 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4063 uint64_t Mask = MaskCst->getZExtValue();
4064 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4065 }
4066
4067 return false;
4068 };
4069
4070 if (!Op.hasOneUse())
4071 return 0;
4072
4073 if (isSupportedExtend(Op))
4074 return 1;
4075
4076 unsigned Opc = Op.getOpcode();
4077 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4078 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4079 uint64_t Shift = ShiftCst->getZExtValue();
4080 if (isSupportedExtend(Op.getOperand(0)))
4081 return (Shift <= 4) ? 2 : 1;
4082 EVT VT = Op.getValueType();
4083 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4084 return 1;
4085 }
4086
4087 return 0;
4088}
4089
4090// emitComparison() converts a comparison with one or negative one into a
4091// comparison with 0. Note that this only works for signed comparisons because
4092// of how ANDS works.
4093static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4094 // Only works for ANDS and AND.
4095 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4096 return false;
4097
4098 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4099 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4100 return true;
4101 }
4102
4103 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4104 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4105 return true;
4106 }
4107
4108 return false;
4109}
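A quick standalone check (plain C++) of the constant adjustment above for signed compares: x < 1 is the same test as x <= 0, and x > -1 the same as x >= 0, so the RHS can be rewritten to zero and the AND folded into ANDS.

#include <cstdio>

int main() {
  for (int x = -2; x <= 2; ++x)
    std::printf("x=%2d  (x<1)=%d (x<=0)=%d | (x>-1)=%d (x>=0)=%d\n",
                x, x < 1, x <= 0, x > -1, x >= 0);
  return 0;
}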
4110
4111static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4112 SDValue &AArch64cc, SelectionDAG &DAG,
4113 const SDLoc &DL) {
4114 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4115 EVT VT = RHS.getValueType();
4116 APInt C = RHSC->getAPIntValue();
4117 // shouldBeAdjustedToZero is a special case to better fold with
4118 // emitComparison().
4119 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4120 // Adjust the constant to zero.
4121 // CC has already been adjusted.
4122 RHS = DAG.getConstant(0, DL, VT);
4123 } else if (!isLegalCmpImmed(C)) {
4124 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4125 // The constant does not fit; try adjusting it by one.
4126 switch (CC) {
4127 default:
4128 break;
4129 case ISD::SETLT:
4130 case ISD::SETGE:
4131 if (!C.isMinSignedValue()) {
4132 APInt CMinusOne = C - 1;
4133 if (isLegalCmpImmed(CMinusOne) ||
4134 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4135 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4136 RHS = DAG.getConstant(CMinusOne, DL, VT);
4137 }
4138 }
4139 break;
4140 case ISD::SETULT:
4141 case ISD::SETUGE: {
4142 // C cannot be 0: 0 is a legal immediate and we only get here when C is not.
4143 assert(!C.isZero() && "C should not be zero here");
4144 APInt CMinusOne = C - 1;
4145 if (isLegalCmpImmed(CMinusOne) ||
4146 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4147 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4148 RHS = DAG.getConstant(CMinusOne, DL, VT);
4149 }
4150 break;
4151 }
4152 case ISD::SETLE:
4153 case ISD::SETGT:
4154 if (!C.isMaxSignedValue()) {
4155 APInt CPlusOne = C + 1;
4156 if (isLegalCmpImmed(CPlusOne) ||
4157 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4158 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4159 RHS = DAG.getConstant(CPlusOne, DL, VT);
4160 }
4161 }
4162 break;
4163 case ISD::SETULE:
4164 case ISD::SETUGT: {
4165 if (!C.isAllOnes()) {
4166 APInt CPlusOne = C + 1;
4167 if (isLegalCmpImmed(CPlusOne) ||
4168 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4169 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4170 RHS = DAG.getConstant(CPlusOne, DL, VT);
4171 }
4172 }
4173 break;
4174 }
4175 }
4176 }
4177 }
4178
4179 // Comparisons are canonicalized so that the RHS operand is simpler than the
4180 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4181 // can fold some shift+extend operations on the RHS operand, so swap the
4182 // operands if that can be done.
4183 //
4184 // For example:
4185 // lsl w13, w11, #1
4186 // cmp w13, w12
4187 // can be turned into:
4188 // cmp w12, w11, lsl #1
4189 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4190 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4191 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4192 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4193 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4194
4195 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4196 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4197 std::swap(LHS, RHS);
4198 CC = ISD::getSetCCSwappedOperands(CC);
4199 }
4200 }
4201
4202 SDValue Cmp;
4203 AArch64CC::CondCode AArch64CC;
4204 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4205 ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4206
4207 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4208 // For the i8 operand, the largest immediate is 255, so this can be easily
4209 // encoded in the compare instruction. For the i16 operand, however, the
4210 // largest immediate cannot be encoded in the compare.
4211 // Therefore, use a sign extending load and cmn to avoid materializing the
4212 // -1 constant. For example,
4213 // movz w1, #65535
4214 // ldrh w0, [x0, #0]
4215 // cmp w0, w1
4216 // >
4217 // ldrsh w0, [x0, #0]
4218 // cmn w0, #1
4219 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4220 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4221 // ensure both the LHS and RHS are truly zero extended and to make sure the
4222 // transformation is profitable.
4223 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4224 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4225 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4226 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4227 int16_t ValueofRHS = RHS->getAsZExtVal();
4228 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4229 SDValue SExt =
4230 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4231 DAG.getValueType(MVT::i16));
4232 Cmp = emitComparison(
4233 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4234 DL, DAG);
4236 }
4237 }
4238
4239 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4240 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4241 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4243 }
4244 }
4245 }
4246
4247 if (!Cmp) {
4248 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4250 }
4251 AArch64cc = getCondCode(DAG, AArch64CC);
4252 return Cmp;
4253}
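A standalone sketch (plain C++, not the LLVM routine) of the constant adjustment used above: when C itself is not a legal compare immediate, the strict/non-strict variant with C-1 or C+1 may be, and the predicate is adjusted to match. The constants below are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  int64_t c = 0x1001;                // 4097: not an encodable compare immediate
  int64_t cMinusOne = c - 1;         // 4096: encodable via the LSL #12 form
  for (int64_t x : {4094, 4095, 4096, 4097})
    std::printf("x=%lld  (x < %lld)=%d  (x <= %lld)=%d\n",
                (long long)x, (long long)c, x < c,
                (long long)cMinusOne, x <= cMinusOne);
  return 0;
}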
4254
4255static std::pair<SDValue, SDValue>
4256getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4257 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4258 "Unsupported value type");
4259 SDValue Value, Overflow;
4260 SDLoc DL(Op);
4261 SDValue LHS = Op.getOperand(0);
4262 SDValue RHS = Op.getOperand(1);
4263 unsigned Opc = 0;
4264 switch (Op.getOpcode()) {
4265 default:
4266 llvm_unreachable("Unknown overflow instruction!");
4267 case ISD::SADDO:
4268 Opc = AArch64ISD::ADDS;
4269 CC = AArch64CC::VS;
4270 break;
4271 case ISD::UADDO:
4272 Opc = AArch64ISD::ADDS;
4273 CC = AArch64CC::HS;
4274 break;
4275 case ISD::SSUBO:
4276 Opc = AArch64ISD::SUBS;
4277 CC = AArch64CC::VS;
4278 break;
4279 case ISD::USUBO:
4280 Opc = AArch64ISD::SUBS;
4281 CC = AArch64CC::LO;
4282 break;
4283 // Multiply needs a little bit of extra work.
4284 case ISD::SMULO:
4285 case ISD::UMULO: {
4286 CC = AArch64CC::NE;
4287 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4288 if (Op.getValueType() == MVT::i32) {
4289 // Extend to 64-bits, then perform a 64-bit multiply.
4290 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4291 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4292 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4293 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4294 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4295
4296 // Check that the result fits into a 32-bit integer.
4297 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4298 if (IsSigned) {
4299 // cmp xreg, wreg, sxtw
4300 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4301 Overflow =
4302 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4303 } else {
4304 // tst xreg, #0xffffffff00000000
4305 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4306 Overflow =
4307 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4308 }
4309 break;
4310 }
4311 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4312 // For the 64-bit multiply:
4313 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4314 if (IsSigned) {
4315 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4316 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4317 DAG.getConstant(63, DL, MVT::i64));
4318 // It is important that LowerBits is last, otherwise the arithmetic
4319 // shift will not be folded into the compare (SUBS).
4320 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4321 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4322 .getValue(1);
4323 } else {
4324 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4325 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4326 Overflow =
4327 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4328 DAG.getConstant(0, DL, MVT::i64),
4329 UpperBits).getValue(1);
4330 }
4331 break;
4332 }
4333 } // switch (...)
4334
4335 if (Opc) {
4336 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4337
4338 // Emit the AArch64 operation with overflow check.
4339 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4340 Overflow = Value.getValue(1);
4341 }
4342 return std::make_pair(Value, Overflow);
4343}
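A standalone illustration (plain C++) of the i32 SMULO strategy above: multiply in 64 bits, then the product overflowed i32 exactly when it does not equal the sign extension of its own low 32 bits.

#include <cstdint>
#include <cstdio>

static bool smul32Overflows(int32_t a, int32_t b) {
  int64_t wide = (int64_t)a * (int64_t)b;         // 64-bit multiply
  int64_t sext = (int64_t)(int32_t)wide;          // sign-extend the low half
  return wide != sext;                            // mirrors the SUBS ... sxtw check
}

int main() {
  std::printf("%d %d\n",
              smul32Overflows(46341, 46341),      // 1: 46341^2 > INT32_MAX
              smul32Overflows(100, -100));        // 0: fits in i32
  return 0;
}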
4344
4345SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4346 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4347 !Subtarget->isNeonAvailable()))
4348 return LowerToScalableOp(Op, DAG);
4349
4350 SDValue Sel = Op.getOperand(0);
4351 SDValue Other = Op.getOperand(1);
4352 SDLoc DL(Sel);
4353
4354 // If the operand is an overflow checking operation, invert the condition
4355 // code and kill the Not operation. I.e., transform:
4356 // (xor overflow_op_bool, 1)
4357 // -->
4358 // (csel 1, 0, invert(cc), overflow_op_bool)
4359 // ... which later gets transformed to just a cset instruction with an
4360 // inverted condition code, rather than a cset + eor sequence.
4361 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
4362 // Only lower legal XALUO ops.
4363 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4364 return SDValue();
4365
4366 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4367 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4368 AArch64CC::CondCode CC;
4369 SDValue Value, Overflow;
4370 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4371 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4372 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4373 CCVal, Overflow);
4374 }
4375 // If neither operand is a SELECT_CC, give up.
4376 if (Sel.getOpcode() != ISD::SELECT_CC)
4377 std::swap(Sel, Other);
4378 if (Sel.getOpcode() != ISD::SELECT_CC)
4379 return Op;
4380
4381 // The folding we want to perform is:
4382 // (xor x, (select_cc a, b, cc, 0, -1) )
4383 // -->
4384 // (csel x, (xor x, -1), cc ...)
4385 //
4386 // The latter will get matched to a CSINV instruction.
4387
4388 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4389 SDValue LHS = Sel.getOperand(0);
4390 SDValue RHS = Sel.getOperand(1);
4391 SDValue TVal = Sel.getOperand(2);
4392 SDValue FVal = Sel.getOperand(3);
4393
4394 // FIXME: This could be generalized to non-integer comparisons.
4395 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4396 return Op;
4397
4398 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4399 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4400
4401 // The values aren't constants, this isn't the pattern we're looking for.
4402 if (!CFVal || !CTVal)
4403 return Op;
4404
4405 // We can commute the SELECT_CC by inverting the condition. This
4406 // might be needed to make this fit into a CSINV pattern.
4407 if (CTVal->isAllOnes() && CFVal->isZero()) {
4408 std::swap(TVal, FVal);
4409 std::swap(CTVal, CFVal);
4410 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4411 }
4412
4413 // If the constants line up, perform the transform!
4414 if (CTVal->isZero() && CFVal->isAllOnes()) {
4415 SDValue CCVal;
4416 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4417
4418 FVal = Other;
4419 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4420 DAG.getAllOnesConstant(DL, Other.getValueType()));
4421
4422 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4423 CCVal, Cmp);
4424 }
4425
4426 return Op;
4427}
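A standalone check (plain C++) of the identity behind the CSINV fold above: x ^ (cond ? 0 : -1) is x when the condition holds and ~x otherwise.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t x = 0x12345678u;
  for (int cond = 0; cond <= 1; ++cond) {
    uint32_t viaXor = x ^ (cond ? 0u : ~0u);
    uint32_t viaSel = cond ? x : ~x;              // what CSEL/CSINV selects
    std::printf("cond=%d  0x%08x 0x%08x\n", cond, viaXor, viaSel);
  }
  return 0;
}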
4428
4429// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4430// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4431// sets 'C' bit to 0.
4432static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4433 SDLoc DL(Value);
4434 EVT VT = Value.getValueType();
4435 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4436 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4437 SDValue Cmp =
4438 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4439 return Cmp.getValue(1);
4440}
4441
4442// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4443// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4444static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4445 bool Invert) {
4446 assert(Glue.getResNo() == 1);
4447 SDLoc DL(Glue);
4448 SDValue Zero = DAG.getConstant(0, DL, VT);
4449 SDValue One = DAG.getConstant(1, DL, VT);
4450 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4451 SDValue CC = getCondCode(DAG, Cond);
4452 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4453}
4454
4455// Value is 1 if 'V' bit of NZCV is 1, else 0
4456static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4457 assert(Glue.getResNo() == 1);
4458 SDLoc DL(Glue);
4459 SDValue Zero = DAG.getConstant(0, DL, VT);
4460 SDValue One = DAG.getConstant(1, DL, VT);
4461 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4462 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4463}
4464
4465// This lowering is inefficient, but it will get cleaned up by
4466// `foldOverflowCheck`
4467static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4468 unsigned Opcode, bool IsSigned) {
4469 EVT VT0 = Op.getValue(0).getValueType();
4470 EVT VT1 = Op.getValue(1).getValueType();
4471
4472 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4473 return SDValue();
4474
4475 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4476 SDValue OpLHS = Op.getOperand(0);
4477 SDValue OpRHS = Op.getOperand(1);
4478 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4479
4480 SDLoc DL(Op);
4481
4482 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4483 OpRHS, OpCarryIn);
4484
4485 SDValue OutFlag =
4486 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4487 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4488
4489 return DAG.getMergeValues({Sum, OutFlag}, DL);
4490}
4491
4492static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4493 // Let legalize expand this if it isn't a legal type yet.
4494 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4495 return SDValue();
4496
4497 SDLoc DL(Op);
4498 AArch64CC::CondCode CC;
4499 // The actual operation that sets the overflow or carry flag.
4500 SDValue Value, Overflow;
4501 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4502
4503 // We use 0 and 1 as false and true values.
4504 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4505 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4506
4507 // We use an inverted condition, because the conditional select is inverted
4508 // too. This will allow it to be selected to a single instruction:
4509 // CSINC Wd, WZR, WZR, invert(cond).
4510 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4511 Overflow =
4512 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4513
4514 return DAG.getMergeValues({Value, Overflow}, DL);
4515}
4516
4517// Prefetch operands are:
4518// 1: Address to prefetch
4519// 2: bool isWrite
4520// 3: int locality (0 = no locality ... 3 = extreme locality)
4521// 4: bool isDataCache
4522static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4523 SDLoc DL(Op);
4524 unsigned IsWrite = Op.getConstantOperandVal(2);
4525 unsigned Locality = Op.getConstantOperandVal(3);
4526 unsigned IsData = Op.getConstantOperandVal(4);
4527
4528 bool IsStream = !Locality;
4529 // When the locality number is set
4530 if (Locality) {
4531 // The front-end should have filtered out the out-of-range values
4532 assert(Locality <= 3 && "Prefetch locality out-of-range");
4533 // The locality degree is the opposite of the cache speed.
4534 // Put the number the other way around.
4535 // The encoding starts at 0 for level 1
4536 Locality = 3 - Locality;
4537 }
4538
4539 // Build the mask value encoding the expected behavior.
4540 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4541 (!IsData << 3) | // IsDataCache bit
4542 (Locality << 1) | // Cache level bits
4543 (unsigned)IsStream; // Stream bit
4544 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4545 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4546 Op.getOperand(1));
4547}
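A standalone sketch (plain C++, illustrative function name, not the LLVM lowering) of how the prefetch operand above packs its fields; the bit arithmetic mirrors the code.

#include <cstdio>

static unsigned prfmOperand(bool isWrite, unsigned locality, bool isData) {
  bool isStream = (locality == 0);                // no locality -> streaming hint
  unsigned level = locality ? 3 - locality : 0;   // invert: 3 -> L1, 1 -> L3
  return (isWrite << 4) |                         // load(0) / store(1)
         (!isData << 3) |                         // data(0) / instruction(1) cache
         (level << 1) |                           // target cache level
         (unsigned)isStream;                      // keep(0) / stream(1)
}

int main() {
  // A __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) style request:
  std::printf("read, locality 3, data cache -> %u\n",
              prfmOperand(false, 3, true));       // 0, i.e. the L1 keep hint
  return 0;
}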
4548
4549// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z is
4550// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4551// (AND X Y) Z, which optimizes better with emitComparison().
4552static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4553 SelectionDAG &DAG, const SDLoc DL) {
4554 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4555 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4556 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4557 if (LHSConstOp && RHSConst) {
4558 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4559 uint64_t RHSConstant = RHSConst->getZExtValue();
4560 if (isPowerOf2_64(RHSConstant)) {
4561 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4562 LHS =
4563 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4564 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4565 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4566 CC = ISD::SETEQ;
4567 }
4568 }
4569 }
4570}
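A brute-force standalone check (plain C++) of the rewrite above: for a power-of-two Z, (X & Y) u< Z holds exactly when X has no bits set in Y & ~(Z - 1), i.e. (X & (Y & ~(Z - 1))) == 0.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Y = 0xF3u, Z = 8u;               // Z is a power of two
  const uint32_t Mask = Y & ~(Z - 1);             // high part of Y only
  bool allMatch = true;
  for (uint32_t x = 0; x < 256; ++x)
    allMatch &= (((x & Y) < Z) == ((x & Mask) == 0));
  std::printf("identity holds for all x in [0,255]: %d\n", allMatch);
  return 0;
}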
4571
4572SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4573 SelectionDAG &DAG) const {
4574 EVT VT = Op.getValueType();
4575 if (VT.isScalableVector()) {
4576 SDValue SrcVal = Op.getOperand(0);
4577
4578 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4579 // Break conversion in two with the first part converting to f32 and the
4580 // second using native f32->VT instructions.
4581 SDLoc DL(Op);
4582 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4583 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4584 }
4585
4586 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4587 }
4588
4589 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4590 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4591
4592 bool IsStrict = Op->isStrictFPOpcode();
4593 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4594 EVT Op0VT = Op0.getValueType();
4595 if (VT == MVT::f64) {
4596 // f32->f64 and f16->f64 extends are legal.
4597 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4598 return Op;
4599 // Split bf16->f64 extends into two fpextends.
4600 if (Op0VT == MVT::bf16 && IsStrict) {
4601 SDValue Ext1 =
4602 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4603 {Op0, Op.getOperand(0)});
4604 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4605 {Ext1, Ext1.getValue(1)});
4606 }
4607 if (Op0VT == MVT::bf16)
4608 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4609 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4610 return SDValue();
4611 }
4612
4613 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4614 return SDValue();
4615}
4616
4617SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4618 SelectionDAG &DAG) const {
4619 EVT VT = Op.getValueType();
4620 bool IsStrict = Op->isStrictFPOpcode();
4621 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4622 EVT SrcVT = SrcVal.getValueType();
4623 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4624
4625 if (VT.isScalableVector()) {
4626 // Let common code split the operation.
4627 if (SrcVT == MVT::nxv8f32)
4628 return Op;
4629
4630 if (VT.getScalarType() != MVT::bf16)
4631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4632
4633 SDLoc DL(Op);
4634 constexpr EVT I32 = MVT::nxv4i32;
4635 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4636
4637 SDValue NaN;
4638 SDValue Narrow;
4639
4640 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4641 if (Subtarget->hasBF16())
4642 return LowerToPredicatedOp(Op, DAG,
4643 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4644
4645 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4646
4647 // Set the quiet bit.
4648 if (!DAG.isKnownNeverSNaN(SrcVal))
4649 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4650 } else if (SrcVT == MVT::nxv2f64 &&
4651 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4652 // Round to float without introducing rounding errors and try again.
4653 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4654 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4655 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4656
4657 SmallVector<SDValue, 3> NewOps;
4658 if (IsStrict)
4659 NewOps.push_back(Op.getOperand(0));
4660 NewOps.push_back(Narrow);
4661 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4662 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4663 } else
4664 return SDValue();
4665
4666 if (!Trunc) {
4667 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4668 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4669 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4670 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4671 }
4672
4673 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4674 // 0x80000000.
4675 if (NaN) {
4676 EVT I1 = I32.changeElementType(MVT::i1);
4677 EVT CondVT = VT.changeElementType(MVT::i1);
4678 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4679 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4680 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4681 }
4682
4683 // Now that we have rounded, shift the bits into position.
4684 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4685 return getSVESafeBitCast(VT, Narrow, DAG);
4686 }
4687
4688 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4689 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4690
4691 // Expand cases where the result type is BF16 but we don't have hardware
4692 // instructions to lower it.
4693 if (VT.getScalarType() == MVT::bf16 &&
4694 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4695 Subtarget->hasBF16())) {
4696 SDLoc DL(Op);
4697 SDValue Narrow = SrcVal;
4698 SDValue NaN;
4699 EVT I32 = SrcVT.changeElementType(MVT::i32);
4700 EVT F32 = SrcVT.changeElementType(MVT::f32);
4701 if (SrcVT.getScalarType() == MVT::f32) {
4702 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4703 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4704 if (!NeverSNaN) {
4705 // Set the quiet bit.
4706 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4707 DAG.getConstant(0x400000, DL, I32));
4708 }
4709 } else if (SrcVT.getScalarType() == MVT::f64) {
4710 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4711 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4712 } else {
4713 return SDValue();
4714 }
4715 if (!Trunc) {
4716 SDValue One = DAG.getConstant(1, DL, I32);
4717 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4718 DAG.getShiftAmountConstant(16, I32, DL));
4719 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4720 SDValue RoundingBias =
4721 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4722 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4723 }
4724
4725 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4726 // 0x80000000.
4727 if (NaN) {
4728 SDValue IsNaN = DAG.getSetCC(
4729 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4730 SrcVal, SrcVal, ISD::SETUO);
4731 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4732 }
4733
4734 // Now that we have rounded, shift the bits into position.
4735 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4736 DAG.getShiftAmountConstant(16, I32, DL));
4737 if (VT.isVector()) {
4738 EVT I16 = I32.changeVectorElementType(MVT::i16);
4739 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4740 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4741 }
4742 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4743 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4744 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4745 : Result;
4746 }
4747
4748 if (SrcVT != MVT::f128) {
4749 // Expand cases where the input is a vector bigger than NEON.
4751 return SDValue();
4752
4753 // It's legal except when f128 is involved
4754 return Op;
4755 }
4756
4757 return SDValue();
4758}
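A standalone sketch (plain C++, not the SelectionDAG lowering, illustrative function name) of the float -> bf16 rounding performed above: add 0x7fff plus the LSB of the result for round-to-nearest-even, force the quiet bit for NaNs, then take the top 16 bits.

#include <cstdint>
#include <cstring>
#include <cstdio>

static uint16_t floatToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7fffffffu) > 0x7f800000u)          // NaN: just set the quiet bit
    return (uint16_t)((bits | 0x400000u) >> 16);
  uint32_t lsb = (bits >> 16) & 1u;                // ties-to-even adjustment
  return (uint16_t)((bits + 0x7fffu + lsb) >> 16);
}

int main() {
  std::printf("1.0f -> 0x%04x, 1.00390625f -> 0x%04x\n",
              floatToBF16(1.0f), floatToBF16(1.00390625f));
  return 0;
}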
4759
4760SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4761 SelectionDAG &DAG) const {
4762 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4763 // Any additional optimization in this function should be recorded
4764 // in the cost tables.
4765 bool IsStrict = Op->isStrictFPOpcode();
4766 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4767 EVT VT = Op.getValueType();
4768
4769 assert(!(IsStrict && VT.isScalableVector()) &&
4770 "Unimplemented SVE support for STRICT_FP_to_INT!");
4771
4772 // f16 conversions are promoted to f32 when full fp16 is not supported.
4773 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4774 InVT.getVectorElementType() == MVT::bf16) {
4775 EVT NewVT = VT.changeElementType(MVT::f32);
4776 SDLoc DL(Op);
4777 if (IsStrict) {
4778 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4779 {Op.getOperand(0), Op.getOperand(1)});
4780 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4781 {Ext.getValue(1), Ext.getValue(0)});
4782 }
4783 return DAG.getNode(
4784 Op.getOpcode(), DL, Op.getValueType(),
4785 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4786 }
4787
4788 if (VT.isScalableVector()) {
4789 if (VT.getVectorElementType() == MVT::i1) {
4790 SDLoc DL(Op);
4791 EVT CvtVT = getPromotedVTForPredicate(VT);
4792 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4793 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4794 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4795 }
4796
4797 // Let common code split the operation.
4798 if (InVT == MVT::nxv8f32)
4799 return Op;
4800
4801 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4802 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4803 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4804 return LowerToPredicatedOp(Op, DAG, Opcode);
4805 }
4806
4807 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4808 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4809 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4810
4811 uint64_t VTSize = VT.getFixedSizeInBits();
4812 uint64_t InVTSize = InVT.getFixedSizeInBits();
4813 if (VTSize < InVTSize) {
4814 SDLoc DL(Op);
4815 if (IsStrict) {
4817 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4818 {Op.getOperand(0), Op.getOperand(1)});
4819 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4820 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4821 }
4822 SDValue Cv =
4823 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4824 Op.getOperand(0));
4825 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4826 }
4827
4828 if (VTSize > InVTSize) {
4829 SDLoc DL(Op);
4830 MVT ExtVT =
4833 if (IsStrict) {
4834 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4835 {Op.getOperand(0), Op.getOperand(1)});
4836 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4837 {Ext.getValue(1), Ext.getValue(0)});
4838 }
4839 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4840 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4841 }
4842
4843 // Use a scalar operation for conversions between single-element vectors of
4844 // the same size.
4845 if (InVT.getVectorNumElements() == 1) {
4846 SDLoc DL(Op);
4847 SDValue Extract = DAG.getNode(
4849 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4850 EVT ScalarVT = VT.getScalarType();
4851 if (IsStrict)
4852 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4853 {Op.getOperand(0), Extract});
4854 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4855 }
4856
4857 // Type changing conversions are illegal.
4858 return Op;
4859}
4860
4861SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4862 SelectionDAG &DAG) const {
4863 bool IsStrict = Op->isStrictFPOpcode();
4864 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4865
4866 if (SrcVal.getValueType().isVector())
4867 return LowerVectorFP_TO_INT(Op, DAG);
4868
4869 // f16 conversions are promoted to f32 when full fp16 is not supported.
4870 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4871 SrcVal.getValueType() == MVT::bf16) {
4872 SDLoc DL(Op);
4873 if (IsStrict) {
4874 SDValue Ext =
4875 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4876 {Op.getOperand(0), SrcVal});
4877 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4878 {Ext.getValue(1), Ext.getValue(0)});
4879 }
4880 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4881 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4882 }
4883
4884 if (SrcVal.getValueType() != MVT::f128) {
4885 // It's legal except when f128 is involved
4886 return Op;
4887 }
4888
4889 return SDValue();
4890}
4891
4892SDValue
4893AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4894 SelectionDAG &DAG) const {
4895 // AArch64 FP-to-int conversions saturate to the destination element size, so
4896 // we can lower common saturating conversions to simple instructions.
4897 SDValue SrcVal = Op.getOperand(0);
4898 EVT SrcVT = SrcVal.getValueType();
4899 EVT DstVT = Op.getValueType();
4900 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4901
4902 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4903 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4904 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4905 assert(SatWidth <= DstElementWidth &&
4906 "Saturation width cannot exceed result width");
4907
4908 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4909 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4910 // types, so this is hard to reach.
4911 if (DstVT.isScalableVector())
4912 return SDValue();
4913
4914 EVT SrcElementVT = SrcVT.getVectorElementType();
4915
4916 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4917 SDLoc DL(Op);
4918 SDValue SrcVal2;
4919 if ((SrcElementVT == MVT::f16 &&
4920 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4921 SrcElementVT == MVT::bf16) {
4922 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4923 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4924 // If we are extending to a v8f32, split into two v4f32 to produce legal
4925 // types.
4926 if (F32VT.getSizeInBits() > 128) {
4927 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4928 F32VT = F32VT.getHalfNumVectorElementsVT();
4929 }
4930 SrcVT = F32VT;
4931 SrcElementVT = MVT::f32;
4932 SrcElementWidth = 32;
4933 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4934 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4935 return SDValue();
4936
4937 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4938 // width and produce a fcvtzu.
4939 if (SatWidth == 64 && SrcElementWidth < 64) {
4940 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4941 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4942 SrcVT = F64VT;
4943 SrcElementVT = MVT::f64;
4944 SrcElementWidth = 64;
4945 }
4946 // Cases that we can emit directly.
4947 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4948 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4949 DAG.getValueType(DstVT.getScalarType()));
4950 if (SrcVal2) {
4951 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4952 DAG.getValueType(DstVT.getScalarType()));
4953 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4954 }
4955 return Res;
4956 }
4957
4958 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4959 // result. This is only valid if the legal cvt is larger than the saturate
4960 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4961 // (at least until sqxtn is selected).
4962 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4963 return SDValue();
4964
4965 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4966 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4967 DAG.getValueType(IntVT.getScalarType()));
4968 SDValue NativeCvt2 =
4969 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4970 DAG.getValueType(IntVT.getScalarType()))
4971 : SDValue();
4972 SDValue Sat, Sat2;
4973 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4974 SDValue MinC = DAG.getConstant(
4975 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4976 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4977 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4978 SDValue MaxC = DAG.getConstant(
4979 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4980 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4981 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4982 } else {
4983 SDValue MinC = DAG.getConstant(
4984 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4985 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4986 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4987 }
4988
4989 if (SrcVal2)
4990 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4992 Sat, Sat2);
4993
4994 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4995}
4996
4997SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4998 SelectionDAG &DAG) const {
4999 // AArch64 FP-to-int conversions saturate to the destination register size, so
5000 // we can lower common saturating conversions to simple instructions.
5001 SDValue SrcVal = Op.getOperand(0);
5002 EVT SrcVT = SrcVal.getValueType();
5003
5004 if (SrcVT.isVector())
5005 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5006
5007 EVT DstVT = Op.getValueType();
5008 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5009 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5010 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5011 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5012
5013 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5014 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5015 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5016 SrcVT = MVT::f32;
5017 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5018 SrcVT != MVT::bf16)
5019 return SDValue();
5020
5021 SDLoc DL(Op);
5022 // Cases that we can emit directly.
5023 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5024 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5025 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5026 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5027 DAG.getValueType(DstVT));
5028
5029 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5030 // result. This is only valid if the legal cvt is larger than the saturate
5031 // width.
5032 if (DstWidth < SatWidth)
5033 return SDValue();
5034
5035 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5036 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5037 SDValue CVTf32 =
5038 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5039 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5040 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5041 DAG.getValueType(SatVT));
5042 }
5043 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5044 return DAG.getBitcast(DstVT, CVTf32);
5045 }
5046
5047 SDValue NativeCvt =
5048 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5049 SDValue Sat;
5050 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5051 SDValue MinC = DAG.getConstant(
5052 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5053 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5054 SDValue MaxC = DAG.getConstant(
5055 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5056 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5057 } else {
5058 SDValue MinC = DAG.getConstant(
5059 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5060 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5061 }
5062
5063 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5064}
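A standalone sketch (plain C++, illustrative function name) of the saturating pattern above: do the conversion at a wider legal width, then clamp with min/max against the signed bounds of the requested saturation width. NaN handling is omitted for brevity.

#include <cstdint>
#include <algorithm>
#include <cstdio>

static int16_t fpToI16Sat(double d) {
  // The wide conversion stands in for the native fcvtzs; the clamps mirror
  // the SMIN/SMAX pair emitted by the lowering.
  int64_t wide = (int64_t)d;
  wide = std::min<int64_t>(wide, INT16_MAX);       // SMIN with the upper bound
  wide = std::max<int64_t>(wide, INT16_MIN);       // SMAX with the lower bound
  return (int16_t)wide;
}

int main() {
  std::printf("%d %d %d\n", fpToI16Sat(1e9), fpToI16Sat(-1e9), fpToI16Sat(123.7));
  return 0;
}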
5065
5066SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5067 SelectionDAG &DAG) const {
5068 EVT VT = Op.getValueType();
5069 SDValue Src = Op.getOperand(0);
5070 SDLoc DL(Op);
5071
5072 assert(VT.isVector() && "Expected vector type");
5073
5074 EVT CastVT =
5075 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5076
5077 // Round the floating-point value into a floating-point register with the
5078 // current rounding mode.
5079 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5080
5081 // Truncate the rounded floating point to an integer.
5082 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5084}
5085
5086SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5087 SelectionDAG &DAG) const {
5088 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5089 // Any additional optimization in this function should be recorded
5090 // in the cost tables.
5091 bool IsStrict = Op->isStrictFPOpcode();
5092 EVT VT = Op.getValueType();
5093 SDLoc DL(Op);
5094 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5095 EVT InVT = In.getValueType();
5096 unsigned Opc = Op.getOpcode();
5097 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5098
5099 assert(!(IsStrict && VT.isScalableVector()) &&
5100 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5101
5102 // NOTE: i1->bf16 does not require promotion to f32.
5103 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5104 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5105 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5106 : DAG.getConstantFP(1.0, DL, VT);
5107 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5108 }
5109
5110 // Promote bf16 conversions to f32.
5111 if (VT.getVectorElementType() == MVT::bf16) {
5112 EVT F32 = VT.changeElementType(MVT::f32);
5113 if (IsStrict) {
5114 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5115 {Op.getOperand(0), In});
5116 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5117 {Op.getValueType(), MVT::Other},
5118 {Val.getValue(1), Val.getValue(0),
5119 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5120 }
5121 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5122 DAG.getNode(Op.getOpcode(), DL, F32, In),
5123 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5124 }
5125
5126 if (VT.isScalableVector()) {
5127 // Let common code split the operation.
5128 if (VT == MVT::nxv8f32)
5129 return Op;
5130
5131 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5132 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5133 return LowerToPredicatedOp(Op, DAG, Opcode);
5134 }
5135
5136 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5137 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5138 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5139
5140 uint64_t VTSize = VT.getFixedSizeInBits();
5141 uint64_t InVTSize = InVT.getFixedSizeInBits();
5142 if (VTSize < InVTSize) {
5143 // AArch64 doesn't have a direct vector instruction to convert
5144 // fixed point to floating point AND narrow it at the same time.
5145 // Additional rounding when the target is f32/f64 causes double
5146 // rounding issues. Conversion to f16 is fine due to narrow width.
5147 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5148 bool IsTargetf16 = false;
5149 if (Op.hasOneUse() &&
5150 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5151 // Some vector types are split in half during legalization, followed by
5152 // concatenation, followed by rounding to the original vector type. If we
5153 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5154 SDNode *U = *Op->user_begin();
5155 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5156 EVT TmpVT = U->user_begin()->getValueType(0);
5157 if (TmpVT.getScalarType() == MVT::f16)
5158 IsTargetf16 = true;
5159 }
5160 }
5161
5162 if (IsTargetf32 && !IsTargetf16) {
5163 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5164 }
5165
5166 MVT CastVT =
5168 InVT.getVectorNumElements());
5169 if (IsStrict) {
5170 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5171 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5172 {In.getValue(1), In.getValue(0),
5173 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5174 }
5175 In = DAG.getNode(Opc, DL, CastVT, In);
5176 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5177 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5178 }
5179
5180 if (VTSize > InVTSize) {
5181 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5182 EVT CastVT = VT.changeVectorElementTypeToInteger();
5183 In = DAG.getNode(CastOpc, DL, CastVT, In);
5184 if (IsStrict)
5185 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5186 return DAG.getNode(Opc, DL, VT, In);
5187 }
5188
5189 // Use a scalar operation for conversions between single-element vectors of
5190 // the same size.
5191 if (VT.getVectorNumElements() == 1) {
5192 SDValue Extract =
5194 DAG.getConstant(0, DL, MVT::i64));
5195 EVT ScalarVT = VT.getScalarType();
5196 if (IsStrict)
5197 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5198 {Op.getOperand(0), Extract});
5199 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5200 }
5201
5202 return Op;
5203}
5204
5205SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5206 SelectionDAG &DAG) const {
5207 if (Op.getValueType().isVector())
5208 return LowerVectorINT_TO_FP(Op, DAG);
5209
5210 bool IsStrict = Op->isStrictFPOpcode();
5211 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5212
5213 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5214 Op->getOpcode() == ISD::SINT_TO_FP;
5215
5216 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5217 SDLoc DL(Op);
5218 if (IsStrict) {
5219 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5220 {Op.getOperand(0), SrcVal});
5221 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5222 {Op.getValueType(), MVT::Other},
5223 {Val.getValue(1), Val.getValue(0),
5224 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5225 }
5226 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5227 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5228 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5229 };
5230
5231 if (Op.getValueType() == MVT::bf16) {
5232 unsigned MaxWidth = IsSigned
5233 ? DAG.ComputeMaxSignificantBits(SrcVal)
5234 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5235 // bf16 conversions are promoted to f32 when converting from i16.
5236 if (MaxWidth <= 24) {
5237 return IntToFpViaPromotion(MVT::f32);
5238 }
5239
5240 // bf16 conversions are promoted to f64 when converting from i32.
5241 if (MaxWidth <= 53) {
5242 return IntToFpViaPromotion(MVT::f64);
5243 }
5244
5245 // We need to be careful about i64 -> bf16.
5246 // Consider an i32 22216703.
5247 // This number cannot be represented exactly as an f32, so an itofp will
5248 // turn it into 22216704.0; an fptrunc to bf16 will then turn this into 22282240.0.
5249 // However, the correct bf16 result was supposed to be 22151168.0.
5250 // We need to use sticky rounding to get this correct.
5251 if (SrcVal.getValueType() == MVT::i64) {
5252 SDLoc DL(Op);
5253 // This algorithm is equivalent to the following:
5254 // uint64_t SrcHi = SrcVal & ~0xfffull;
5255 // uint64_t SrcLo = SrcVal & 0xfffull;
5256 // uint64_t Highest = SrcVal >> 53;
5257 // bool HasHighest = Highest != 0;
5258 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5259 // double Rounded = static_cast<double>(ToRound);
5260 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5261 // bool HasLo = SrcLo != 0;
5262 // bool NeedsAdjustment = HasHighest & HasLo;
5263 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5264 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5265 // return static_cast<__bf16>(Adjusted);
5266 //
5267 // Essentially, what happens is that SrcVal either fits perfectly in a
5268 // double-precision value or it is too big. If it is sufficiently small,
5269 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5270 // ensure that u64 -> double has no rounding error by only using the 52
5271 // MSB of the input. The low order bits will get merged into a sticky bit
5272 // which will avoid issues incurred by double rounding.
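// Illustrative reasoning (not exhaustive): whenever Highest != 0, SrcVal is
// at least 2^53, and SrcHi = SrcVal & ~0xfff keeps only bits [12, 63], i.e.
// at most 52 significant bits, so the conversion of SrcHi to double is exact.
// If any of the discarded low bits were set (SrcLo != 0), OR-ing 1 into the
// f64 bit pattern sets a mantissa bit that is known to be zero, acting as a
// sticky bit so the final round to bf16 behaves as if it saw the full value.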
5273
5274 // Signed conversion is more or less like so:
5275 // copysign((__bf16)abs(SrcVal), SrcVal)
5276 SDValue SignBit;
5277 if (IsSigned) {
5278 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5279 DAG.getConstant(1ull << 63, DL, MVT::i64));
5280 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5281 }
5282 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5283 DAG.getConstant(~0xfffull, DL, MVT::i64));
5284 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5285 DAG.getConstant(0xfffull, DL, MVT::i64));
5286 SDValue Highest =
5287 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5288 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5289 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5290 SDValue ToRound =
5291 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5292 SDValue Rounded =
5293 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5294 {Op.getOperand(0), ToRound})
5295 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5296
5297 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5298 if (SignBit) {
5299 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5300 }
5301
5302 SDValue HasHighest = DAG.getSetCC(
5303 DL,
5304 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5305 Highest, Zero64, ISD::SETNE);
5306
5307 SDValue HasLo = DAG.getSetCC(
5308 DL,
5309 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5310 SrcLo, Zero64, ISD::SETNE);
5311
5312 SDValue NeedsAdjustment =
5313 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5314 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5315
5316 SDValue AdjustedBits =
5317 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5318 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5319 return IsStrict
5320 ? DAG.getNode(
5321 ISD::STRICT_FP_ROUND, DL,
5322 {Op.getValueType(), MVT::Other},
5323 {Rounded.getValue(1), Adjusted,
5324 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5325 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5326 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5327 }
5328 }
5329
5330 // f16 conversions are promoted to f32 when full fp16 is not supported.
5331 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5332 return IntToFpViaPromotion(MVT::f32);
5333 }
5334
5335 // i128 conversions are libcalls.
5336 if (SrcVal.getValueType() == MVT::i128)
5337 return SDValue();
5338
5339 // Other conversions are legal, unless it's to the completely software-based
5340 // fp128.
5341 if (Op.getValueType() != MVT::f128)
5342 return Op;
5343 return SDValue();
5344}
5345
5346SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5347 SelectionDAG &DAG) const {
5348 // For iOS, we want to call an alternative entry point: __sincos_stret,
5349 // which returns the values in two S / D registers.
5350 SDLoc DL(Op);
5351 SDValue Arg = Op.getOperand(0);
5352 EVT ArgVT = Arg.getValueType();
5353 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5354
5355 ArgListTy Args;
5356 Args.emplace_back(Arg, ArgTy);
5357
5358 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5359 : RTLIB::SINCOS_STRET_F32;
5360 const char *LibcallName = getLibcallName(LC);
5361 SDValue Callee =
5362 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5363
5364 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5365 TargetLowering::CallLoweringInfo CLI(DAG);
5366 CallingConv::ID CC = getLibcallCallingConv(LC);
5367 CLI.setDebugLoc(DL)
5368 .setChain(DAG.getEntryNode())
5369 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5370
5371 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5372 return CallResult.first;
5373}
5374
5375static MVT getSVEContainerType(EVT ContentTy);
5376
5377SDValue
5378AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5379 SelectionDAG &DAG) const {
5380 SDLoc DL(Op);
5381 uint64_t EltSize = Op.getConstantOperandVal(2);
5382 EVT VT = Op.getValueType();
5383 switch (EltSize) {
5384 case 1:
5385 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5386 return SDValue();
5387 break;
5388 case 2:
5389 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5390 return SDValue();
5391 break;
5392 case 4:
5393 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5394 return SDValue();
5395 break;
5396 case 8:
5397 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5398 return SDValue();
5399 break;
5400 default:
5401 // Other element sizes are incompatible with whilewr/rw, so expand instead
5402 return SDValue();
5403 }
5404
5405 SDValue PtrA = Op.getOperand(0);
5406 SDValue PtrB = Op.getOperand(1);
5407
5408 if (VT.isScalableVT())
5409 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5410
5411 // We can use the SVE whilewr/whilerw instruction to lower this
5412 // intrinsic by creating the appropriate sequence of scalable vector
5413 // operations and then extracting a fixed-width subvector from the scalable
5414 // vector. Scalable vector variants are already legal.
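// For example (illustrative): a v16i8 result with an element size of 1 is
// computed as a whilewr/whilerw producing nxv16i1, sign-extended to nxv16i8,
// and then an EXTRACT_SUBVECTOR at index 0 narrows it back down to v16i8.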
5415 EVT ContainerVT =
5416 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5417 VT.getVectorNumElements(), true);
5418 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5419
5420 SDValue Mask =
5421 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5422 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5423 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5424 DAG.getVectorIdxConstant(0, DL));
5425}
5426
5427SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5428 SelectionDAG &DAG) const {
5429 EVT OpVT = Op.getValueType();
5430 EVT ArgVT = Op.getOperand(0).getValueType();
5431
5432 if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
5433 return LowerFixedLengthBitcastToSVE(Op, DAG);
5434
5435 if (OpVT.isScalableVector()) {
5436 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5437
5438 // Handle type legalisation first.
5439 if (!isTypeLegal(ArgVT)) {
5440 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5441 "Expected int->fp bitcast!");
5442
5443 // Bitcasting between unpacked vector types of different element counts is
5444 // not a NOP because the live elements are laid out differently.
5445 // 01234567
5446 // e.g. nxv2i32 = XX??XX??
5447 // nxv4f16 = X?X?X?X?
5448 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5449 return SDValue();
5450
5451 SDValue ExtResult =
5452 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5453 Op.getOperand(0));
5454 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5455 }
5456
5457 // Bitcasts between legal types with the same element count are legal.
5458 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5459 return Op;
5460
5461 // getSVESafeBitCast does not support casting between unpacked types.
5462 if (!isPackedVectorType(OpVT, DAG))
5463 return SDValue();
5464
5465 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5466 }
5467
5468 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5469 return SDValue();
5470
5471 // Bitcasts between f16 and bf16 are legal.
5472 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5473 return Op;
5474
5475 assert(ArgVT == MVT::i16);
5476 SDLoc DL(Op);
5477
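// Lower the scalar i16 -> f16/bf16 bitcast by widening to i32, bitcasting to
// f32 and then extracting the H subregister, which holds the low 16 bits of
// the corresponding S register.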
5478 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5479 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5480 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5481}
5482
5483// Returns lane if Op extracts from a two-element vector and lane is constant
5484// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5485static std::optional<uint64_t>
5487 SDNode *OpNode = Op.getNode();
5488 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5489 return std::nullopt;
5490
5491 EVT VT = OpNode->getOperand(0).getValueType();
5492 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5493 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5494 return std::nullopt;
5495
5496 return C->getZExtValue();
5497}
5498
5499 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5500 bool isSigned) {
5501 EVT VT = N.getValueType();
5502
5503 if (N.getOpcode() != ISD::BUILD_VECTOR)
5504 return false;
5505
5506 for (const SDValue &Elt : N->op_values()) {
5507 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5508 unsigned EltSize = VT.getScalarSizeInBits();
5509 unsigned HalfSize = EltSize / 2;
5510 if (isSigned) {
5511 if (!isIntN(HalfSize, C->getSExtValue()))
5512 return false;
5513 } else {
5514 if (!isUIntN(HalfSize, C->getZExtValue()))
5515 return false;
5516 }
5517 continue;
5518 }
5519 return false;
5520 }
5521
5522 return true;
5523}
5524
5525 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5526 EVT VT = N.getValueType();
5527 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5528 EVT HalfVT = EVT::getVectorVT(
5529 *DAG.getContext(),
5530 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5531 VT.getVectorNumElements());
5532 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5533}
5534
5536 return N.getOpcode() == ISD::SIGN_EXTEND ||
5537 N.getOpcode() == ISD::ANY_EXTEND ||
5538 isExtendedBUILD_VECTOR(N, DAG, true);
5539}
5540
5542 return N.getOpcode() == ISD::ZERO_EXTEND ||
5543 N.getOpcode() == ISD::ANY_EXTEND ||
5544 isExtendedBUILD_VECTOR(N, DAG, false);
5545}
5546
5548 unsigned Opcode = N.getOpcode();
5549 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5550 SDValue N0 = N.getOperand(0);
5551 SDValue N1 = N.getOperand(1);
5552 return N0->hasOneUse() && N1->hasOneUse() &&
5553 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5554 }
5555 return false;
5556}
5557
5559 unsigned Opcode = N.getOpcode();
5560 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5561 SDValue N0 = N.getOperand(0);
5562 SDValue N1 = N.getOperand(1);
5563 return N0->hasOneUse() && N1->hasOneUse() &&
5564 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5565 }
5566 return false;
5567}
5568
5569SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5570 SelectionDAG &DAG) const {
5571 // The rounding mode is in bits 23:22 of the FPCR.
5572 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5573 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5574 // so that the shift + and get folded into a bitfield extract.
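// For example (illustrative): FPCR.RMode = 2 (round towards minus infinity)
// gives ((2 << 22) + (1 << 22)) >> 22 = 3 and 3 & 3 = 3, while RMode = 3
// (round towards zero) wraps around to (3 + 1) & 3 = 0, matching the
// FLT_ROUNDS encoding above.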
5575 SDLoc DL(Op);
5576
5577 SDValue Chain = Op.getOperand(0);
5578 SDValue FPCR_64 = DAG.getNode(
5579 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5580 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5581 Chain = FPCR_64.getValue(1);
5582 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5583 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5584 DAG.getConstant(1U << 22, DL, MVT::i32));
5585 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5586 DAG.getConstant(22, DL, MVT::i32));
5587 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5588 DAG.getConstant(3, DL, MVT::i32));
5589 return DAG.getMergeValues({AND, Chain}, DL);
5590}
5591
5592SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5593 SelectionDAG &DAG) const {
5594 SDLoc DL(Op);
5595 SDValue Chain = Op->getOperand(0);
5596 SDValue RMValue = Op->getOperand(1);
5597
5598 // The rounding mode is in bits 23:22 of the FPCR.
5599 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5600 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5601 // (((arg - 1) & 3) << 22).
5602 //
5603 // The argument of llvm.set.rounding must be within the range [0, 3], so
5604 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5605 // code that generates llvm.set.rounding to ensure this condition.
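// For example (illustrative): an argument of 0 (round towards zero) becomes
// ((0 - 1) & 3) << 22 = 3 << 22, i.e. FPCR.RMode = 3 (RZ), and an argument of
// 1 (round to nearest) becomes 0 << 22, i.e. FPCR.RMode = 0 (RN).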
5606
5607 // Calculate new value of FPCR[23:22].
5608 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5609 DAG.getConstant(1, DL, MVT::i32));
5610 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5611 DAG.getConstant(0x3, DL, MVT::i32));
5612 RMValue =
5613 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5614 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5615 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5616
5617 // Get current value of FPCR.
5618 SDValue Ops[] = {
5619 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5620 SDValue FPCR =
5621 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5622 Chain = FPCR.getValue(1);
5623 FPCR = FPCR.getValue(0);
5624
5625 // Put the new rounding mode into FPCR[23:22].
5626 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5627 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5628 DAG.getConstant(RMMask, DL, MVT::i64));
5629 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5630 SDValue Ops2[] = {
5631 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5632 FPCR};
5633 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5634}
5635
5636SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5637 SelectionDAG &DAG) const {
5638 SDLoc DL(Op);
5639 SDValue Chain = Op->getOperand(0);
5640
5641 // Get current value of FPCR.
5642 SDValue Ops[] = {
5643 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5644 SDValue FPCR =
5645 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5646 Chain = FPCR.getValue(1);
5647 FPCR = FPCR.getValue(0);
5648
5649 // Truncate FPCR to 32 bits.
5650 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5651
5652 return DAG.getMergeValues({Result, Chain}, DL);
5653}
5654
5655SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5656 SelectionDAG &DAG) const {
5657 SDLoc DL(Op);
5658 SDValue Chain = Op->getOperand(0);
5659 SDValue Mode = Op->getOperand(1);
5660
5661 // Extend the specified value to 64 bits.
5662 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5663
5664 // Set new value of FPCR.
5665 SDValue Ops2[] = {
5666 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5667 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5668}
5669
5670SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5671 SelectionDAG &DAG) const {
5672 SDLoc DL(Op);
5673 SDValue Chain = Op->getOperand(0);
5674
5675 // Get current value of FPCR.
5676 SDValue Ops[] = {
5677 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5678 SDValue FPCR =
5679 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5680 Chain = FPCR.getValue(1);
5681 FPCR = FPCR.getValue(0);
5682
5683 // Clear bits that are not reserved.
5684 SDValue FPSCRMasked = DAG.getNode(
5685 ISD::AND, DL, MVT::i64, FPCR,
5687
5688 // Set new value of FPCR.
5689 SDValue Ops2[] = {Chain,
5690 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5691 FPSCRMasked};
5692 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5693}
5694
5695static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5696 SDLoc DL, bool &IsMLA) {
5697 bool IsN0SExt = isSignExtended(N0, DAG);
5698 bool IsN1SExt = isSignExtended(N1, DAG);
5699 if (IsN0SExt && IsN1SExt)
5700 return AArch64ISD::SMULL;
5701
5702 bool IsN0ZExt = isZeroExtended(N0, DAG);
5703 bool IsN1ZExt = isZeroExtended(N1, DAG);
5704
5705 if (IsN0ZExt && IsN1ZExt)
5706 return AArch64ISD::UMULL;
5707
5708 // Select UMULL if we can replace the other operand with an extend.
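// For example (illustrative): if N0 is zext(v8i8) and the top half of every
// lane of N1 is known to be zero, N1 already behaves like a zero-extended
// value, so the pair can still use UMULL.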
5709 EVT VT = N0.getValueType();
5710 unsigned EltSize = VT.getScalarSizeInBits();
5711 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5712 if (IsN0ZExt || IsN1ZExt) {
5713 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5714 return AArch64ISD::UMULL;
5715 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5716 DAG.MaskedValueIsZero(N1, Mask)) {
5717 // For v2i64 we look more aggressively at both operands being zero, to avoid
5718 // scalarization.
5719 return AArch64ISD::UMULL;
5720 }
5721
5722 if (IsN0SExt || IsN1SExt) {
5723 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5724 return AArch64ISD::SMULL;
5725 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5726 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5727 return AArch64ISD::SMULL;
5728 }
5729
5730 if (!IsN1SExt && !IsN1ZExt)
5731 return 0;
5732
5733 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5734 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
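// For example (illustrative): (sext(v8i8 a) + sext(v8i8 b)) * sext(v8i8 c)
// becomes smull(a, c) + smull(b, c) feeding an add/sub, so the second
// multiply can later be selected as an accumulating SMLAL/SMLSL.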
5735 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5736 IsMLA = true;
5737 return AArch64ISD::SMULL;
5738 }
5739 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5740 IsMLA = true;
5741 return AArch64ISD::UMULL;
5742 }
5743 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5744 std::swap(N0, N1);
5745 IsMLA = true;
5746 return AArch64ISD::UMULL;
5747 }
5748 return 0;
5749}
5750
5751SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5752 EVT VT = Op.getValueType();
5753
5754 bool OverrideNEON = !Subtarget->isNeonAvailable();
5755 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5756 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5757
5758 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5759 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5760 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5761 "unexpected type for custom-lowering ISD::MUL");
5762 SDValue N0 = Op.getOperand(0);
5763 SDValue N1 = Op.getOperand(1);
5764 bool isMLA = false;
5765 EVT OVT = VT;
5766 if (VT.is64BitVector()) {
5767 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5768 isNullConstant(N0.getOperand(1)) &&
5770 isNullConstant(N1.getOperand(1))) {
5771 N0 = N0.getOperand(0);
5772 N1 = N1.getOperand(0);
5773 VT = N0.getValueType();
5774 } else {
5775 if (VT == MVT::v1i64) {
5776 if (Subtarget->hasSVE())
5777 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5778 // Fall through to expand this. It is not legal.
5779 return SDValue();
5780 } else
5781 // Other vector multiplications are legal.
5782 return Op;
5783 }
5784 }
5785
5786 SDLoc DL(Op);
5787 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5788
5789 if (!NewOpc) {
5790 if (VT.getVectorElementType() == MVT::i64) {
5791 // If SVE is available then i64 vector multiplications can also be made
5792 // legal.
5793 if (Subtarget->hasSVE())
5794 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5795 // Fall through to expand this. It is not legal.
5796 return SDValue();
5797 } else
5798 // Other vector multiplications are legal.
5799 return Op;
5800 }
5801
5802 // Legalize to a S/UMULL instruction
5803 SDValue Op0;
5804 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5805 if (!isMLA) {
5806 Op0 = skipExtensionForVectorMULL(N0, DAG);
5807 assert(Op0.getValueType().is64BitVector() &&
5808 Op1.getValueType().is64BitVector() &&
5809 "unexpected types for extended operands to VMULL");
5810 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5811 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5812 DAG.getConstant(0, DL, MVT::i64));
5813 }
5814 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5815 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5816 // This is beneficial for CPUs with accumulate forwarding such as Cortex-A53/A57.
5817 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5818 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5819 EVT Op1VT = Op1.getValueType();
5820 return DAG.getNode(
5821 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5822 DAG.getNode(N0.getOpcode(), DL, VT,
5823 DAG.getNode(NewOpc, DL, VT,
5824 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5825 DAG.getNode(NewOpc, DL, VT,
5826 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5827 DAG.getConstant(0, DL, MVT::i64));
5828}
5829
5830static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5831 int Pattern) {
5832 if (Pattern == AArch64SVEPredPattern::all)
5833 return DAG.getConstant(1, DL, VT);
5834 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5835 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5836}
5837
5838 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5839 bool IsSigned, bool IsEqual) {
5840 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5841 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5842
5843 if (!N->getValueType(0).isScalableVector() ||
5844 !isa<ConstantSDNode>(N->getOperand(Op1)))
5845 return SDValue();
5846
5847 SDLoc DL(N);
5848 APInt Y = N->getConstantOperandAPInt(Op1);
5849
5850 // When the second operand is the maximum value, comparisons that include
5851 // equality can never fail and thus we can return an all active predicate.
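// For example (illustrative): whilele(x, INT64_MAX) and whilels(x, UINT64_MAX)
// can never produce an inactive element, so they lower to an all-ones
// predicate (ptrue all) without looking at the first operand.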
5852 if (IsEqual)
5853 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5854 return DAG.getConstant(1, DL, N->getValueType(0));
5855
5856 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5857 return SDValue();
5858
5859 APInt X = N->getConstantOperandAPInt(Op0);
5860
5861 bool Overflow;
5862 APInt NumActiveElems =
5863 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5864
5865 if (Overflow)
5866 return SDValue();
5867
5868 if (IsEqual) {
5869 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5870 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5871 : NumActiveElems.uadd_ov(One, Overflow);
5872 if (Overflow)
5873 return SDValue();
5874 }
5875
5876 std::optional<unsigned> PredPattern =
5878 unsigned MinSVEVectorSize = std::max(
5880 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5881 if (PredPattern != std::nullopt &&
5882 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5883 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5884
5885 return SDValue();
5886}
5887
5888// Returns a safe bitcast between two scalable vector predicates, where
5889// any newly created lanes from a widening bitcast are defined as zero.
5891 SDLoc DL(Op);
5892 EVT InVT = Op.getValueType();
5893
5894 assert(InVT.getVectorElementType() == MVT::i1 &&
5895 VT.getVectorElementType() == MVT::i1 &&
5896 "Expected a predicate-to-predicate bitcast");
5898 InVT.isScalableVector() &&
5899 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5900 "Only expect to cast between legal scalable predicate types!");
5901
5902 // Return the operand if the cast isn't changing type.
5903 if (InVT == VT)
5904 return Op;
5905
5906 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5907 // than VT. This will increase the chances of removing casts that introduce
5908 // new lanes, which have to be explicitly zero'd.
5909 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5910 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5911 Op.getOperand(1).getValueType().bitsGT(VT))
5912 Op = Op.getOperand(1);
5913
5914 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5915
5916 // We only have to zero the lanes if new lanes are being defined, e.g. when
5917 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5918 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5919 // we can return here.
5920 if (InVT.bitsGT(VT))
5921 return Reinterpret;
5922
5923 // Check if the other lanes are already known to be zeroed by
5924 // construction.
5926 return Reinterpret;
5927
5928 // Zero the newly introduced lanes.
5929 SDValue Mask = DAG.getConstant(1, DL, InVT);
5930 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5931 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5932}
5933
5934SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5935 SDValue Chain, SDLoc DL,
5936 EVT VT) const {
5937 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5938 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
5939 getPointerTy(DAG.getDataLayout()));
5940 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5941 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5942 TargetLowering::CallLoweringInfo CLI(DAG);
5943 ArgListTy Args;
5944 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5945 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5946 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5947 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5948 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5949 Mask);
5950}
5951
5952// Lower an SME LDR/STR ZA intrinsic
5953// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5954// folded into the instruction
5955// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5956// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5957// and tile slice registers
5958// ldr(%tileslice, %ptr, %vecnum)
5959// ->
5960// %svl = rdsvl
5961// %ptr2 = %ptr + %svl * %vecnum
5962// %tileslice2 = %tileslice + %vecnum
5963// ldr [%tileslice2, 0], [%ptr2, 0]
5964// Case 3: If the vecnum is an immediate out of range, then the same is done as
5965 // case 2, but the base and slice registers are modified by the greatest
5966 // multiple of 16 not exceeding the vecnum, and the remainder is folded into the
5967 // instruction. This means that successive loads and stores that are offset from
5968 // each other can share the same base and slice register updates.
5969 // ldr(%tileslice, %ptr, 22)
5970 // ldr(%tileslice, %ptr, 23)
5971 // ->
5972 // %svl = rdsvl
5973 // %ptr2 = %ptr + %svl * 16
5974 // %tileslice2 = %tileslice + 16
5975 // ldr [%tileslice2, 6], [%ptr2, 6]
5976 // ldr [%tileslice2, 7], [%ptr2, 7]
5977// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5978// operand and the immediate can be folded into the instruction, like case 2.
5979// ldr(%tileslice, %ptr, %vecnum + 7)
5980// ldr(%tileslice, %ptr, %vecnum + 8)
5981// ->
5982// %svl = rdsvl
5983// %ptr2 = %ptr + %svl * %vecnum
5984// %tileslice2 = %tileslice + %vecnum
5985// ldr [%tileslice2, 7], [%ptr2, 7]
5986// ldr [%tileslice2, 8], [%ptr2, 8]
5987// Case 5: The vecnum being an add of an immediate out of range is also handled,
5988// in which case the same remainder logic as case 3 is used.
5989 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5990 SDLoc DL(N);
5991
5992 SDValue TileSlice = N->getOperand(2);
5993 SDValue Base = N->getOperand(3);
5994 SDValue VecNum = N->getOperand(4);
5995 int32_t ConstAddend = 0;
5996 SDValue VarAddend = VecNum;
5997
5998 // If the vnum is an add of an immediate, we can fold it into the instruction
5999 if (VecNum.getOpcode() == ISD::ADD &&
6000 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6001 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6002 VarAddend = VecNum.getOperand(0);
6003 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6004 ConstAddend = ImmNode->getSExtValue();
6005 VarAddend = SDValue();
6006 }
6007
6008 int32_t ImmAddend = ConstAddend % 16;
6009 if (int32_t C = (ConstAddend - ImmAddend)) {
6010 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6011 VarAddend = VarAddend
6012 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6013 : CVal;
6014 }
6015
6016 if (VarAddend) {
6017 // Get the vector length that will be multiplied by vnum
6018 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6019 DAG.getConstant(1, DL, MVT::i32));
6020
6021 // Multiply SVL and vnum then add it to the base
6022 SDValue Mul = DAG.getNode(
6023 ISD::MUL, DL, MVT::i64,
6024 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6025 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6026 // Just add vnum to the tileslice
6027 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6028 }
6029
6030 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6031 DL, MVT::Other,
6032 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6033 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6034}
6035
6036 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6037 SDLoc DL(Op);
6038 SDValue ID =
6039 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6040
6041 auto Op1 = Op.getOperand(1);
6042 auto Op2 = Op.getOperand(2);
6043 auto Mask = Op.getOperand(3);
6044
6045 EVT Op1VT = Op1.getValueType();
6046 EVT Op2VT = Op2.getValueType();
6047 EVT ResVT = Op.getValueType();
6048
6049 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6050 Op1VT.getVectorElementType() == MVT::i16) &&
6051 "Expected 8-bit or 16-bit characters.");
6052
6053 // Scalable vector type used to wrap operands.
6054 // A single container is enough for both operands because ultimately the
6055 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6056 EVT OpContainerVT = Op1VT.isScalableVector()
6057 ? Op1VT
6058 : getContainerForFixedLengthVector(DAG, Op1VT);
6059
6060 if (Op2VT.is128BitVector()) {
6061 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6062 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6063 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6064 if (ResVT.isScalableVector())
6065 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6066 DAG.getTargetConstant(0, DL, MVT::i64));
6067 } else {
6068 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6069 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6070 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6071 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6072 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6073 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6074 DAG.getConstant(0, DL, MVT::i64));
6075 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6076 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6077 }
6078
6079 // If the result is scalable, we just need to carry out the MATCH.
6080 if (ResVT.isScalableVector())
6081 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6082
6083 // If the result is fixed, we can still use MATCH but we need to wrap the
6084 // first operand and the mask in scalable vectors before doing so.
6085
6086 // Wrap the operands.
6087 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6088 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6089 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6090
6091 // Carry out the match.
6092 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6093 ID, Mask, Op1, Op2);
6094
6095 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6096 // (v16i8/v8i8).
6097 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6098 Match = convertFromScalableVector(DAG, Op1VT, Match);
6099 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6100}
6101
6102SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6103 SelectionDAG &DAG) const {
6104 unsigned IntNo = Op.getConstantOperandVal(1);
6105 SDLoc DL(Op);
6106 switch (IntNo) {
6107 default:
6108 return SDValue(); // Don't custom lower most intrinsics.
6109 case Intrinsic::aarch64_prefetch: {
6110 SDValue Chain = Op.getOperand(0);
6111 SDValue Addr = Op.getOperand(2);
6112
6113 unsigned IsWrite = Op.getConstantOperandVal(3);
6114 unsigned Locality = Op.getConstantOperandVal(4);
6115 unsigned IsStream = Op.getConstantOperandVal(5);
6116 unsigned IsData = Op.getConstantOperandVal(6);
6117 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6118 (!IsData << 3) | // IsDataCache bit
6119 (Locality << 1) | // Cache level bits
6120 (unsigned)IsStream; // Stream bit
6121
6122 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6123 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6124 }
6125 case Intrinsic::aarch64_sme_str:
6126 case Intrinsic::aarch64_sme_ldr: {
6127 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6128 }
6129 case Intrinsic::aarch64_sme_za_enable:
6130 return DAG.getNode(
6131 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6132 Op->getOperand(0), // Chain
6133 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6134 case Intrinsic::aarch64_sme_za_disable:
6135 return DAG.getNode(
6136 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6137 Op->getOperand(0), // Chain
6138 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6139 }
6140}
6141
6142SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6143 SelectionDAG &DAG) const {
6144 unsigned IntNo = Op.getConstantOperandVal(1);
6145 SDLoc DL(Op);
6146 switch (IntNo) {
6147 default:
6148 return SDValue(); // Don't custom lower most intrinsics.
6149 case Intrinsic::aarch64_mops_memset_tag: {
6150 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6151 SDValue Chain = Node->getChain();
6152 SDValue Dst = Op.getOperand(2);
6153 SDValue Val = Op.getOperand(3);
6154 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6155 SDValue Size = Op.getOperand(4);
6156 auto Alignment = Node->getMemOperand()->getAlign();
6157 bool IsVol = Node->isVolatile();
6158 auto DstPtrInfo = Node->getPointerInfo();
6159
6160 const auto &SDI =
6161 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6162 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6163 Chain, Dst, Val, Size, Alignment, IsVol,
6164 DstPtrInfo, MachinePointerInfo{});
6165
6166 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6167 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6168 // LowerOperationWrapper will complain that the number of results has
6169 // changed.
6170 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6171 }
6172 }
6173}
6174
6175SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6176 SelectionDAG &DAG) const {
6177 unsigned IntNo = Op.getConstantOperandVal(0);
6178 SDLoc DL(Op);
6179 switch (IntNo) {
6180 default: return SDValue(); // Don't custom lower most intrinsics.
6181 case Intrinsic::thread_pointer: {
6182 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6183 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6184 }
6185 case Intrinsic::aarch64_sve_whilewr_b:
6186 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6187 Op.getOperand(1), Op.getOperand(2),
6188 DAG.getConstant(1, DL, MVT::i64));
6189 case Intrinsic::aarch64_sve_whilewr_h:
6190 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6191 Op.getOperand(1), Op.getOperand(2),
6192 DAG.getConstant(2, DL, MVT::i64));
6193 case Intrinsic::aarch64_sve_whilewr_s:
6194 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6195 Op.getOperand(1), Op.getOperand(2),
6196 DAG.getConstant(4, DL, MVT::i64));
6197 case Intrinsic::aarch64_sve_whilewr_d:
6198 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6199 Op.getOperand(1), Op.getOperand(2),
6200 DAG.getConstant(8, DL, MVT::i64));
6201 case Intrinsic::aarch64_sve_whilerw_b:
6202 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6203 Op.getOperand(1), Op.getOperand(2),
6204 DAG.getConstant(1, DL, MVT::i64));
6205 case Intrinsic::aarch64_sve_whilerw_h:
6206 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6207 Op.getOperand(1), Op.getOperand(2),
6208 DAG.getConstant(2, DL, MVT::i64));
6209 case Intrinsic::aarch64_sve_whilerw_s:
6210 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6211 Op.getOperand(1), Op.getOperand(2),
6212 DAG.getConstant(4, DL, MVT::i64));
6213 case Intrinsic::aarch64_sve_whilerw_d:
6214 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6215 Op.getOperand(1), Op.getOperand(2),
6216 DAG.getConstant(8, DL, MVT::i64));
6217 case Intrinsic::aarch64_neon_abs: {
6218 EVT Ty = Op.getValueType();
6219 if (Ty == MVT::i64) {
6220 SDValue Result =
6221 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6222 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6223 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6224 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6225 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6226 } else {
6227 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6228 }
6229 }
6230 case Intrinsic::aarch64_neon_pmull64: {
6231 SDValue LHS = Op.getOperand(1);
6232 SDValue RHS = Op.getOperand(2);
6233
6234 std::optional<uint64_t> LHSLane =
6236 std::optional<uint64_t> RHSLane =
6238
6239 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6240 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6241
6242 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6243 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6244 // which ISel recognizes better. For example, generate a ldr into d*
6245 // registers as opposed to a GPR load followed by a fmov.
6246 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6247 std::optional<uint64_t> OtherLane,
6248 const SDLoc &DL,
6249 SelectionDAG &DAG) -> SDValue {
6250 // If the operand is a higher half itself, rewrite it to
6251 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6252 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6253 if (NLane == 1)
6254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6255 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6256
6257 // Operand N is not a higher half but the other operand is.
6258 if (OtherLane == 1) {
6259 // If this operand is a lower half, rewrite it to
6260 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6261 // align lanes of two operands. A roundtrip sequence (to move from lane
6262 // 1 to lane 0) is like this:
6263 // mov x8, v0.d[1]
6264 // fmov d0, x8
6265 if (NLane == 0)
6266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6267 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6268 N.getOperand(0),
6269 DAG.getConstant(0, DL, MVT::i64)),
6270 DAG.getConstant(1, DL, MVT::i64));
6271
6272 // Otherwise just dup from main to all lanes.
6273 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6274 }
6275
6276 // Neither operand is an extract of the higher half, so codegen may just use
6277 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6278 assert(N.getValueType() == MVT::i64 &&
6279 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6280 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6281 };
6282
6283 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6284 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6285
6286 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6287 }
6288 case Intrinsic::aarch64_neon_smax:
6289 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6290 Op.getOperand(2));
6291 case Intrinsic::aarch64_neon_umax:
6292 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6293 Op.getOperand(2));
6294 case Intrinsic::aarch64_neon_smin:
6295 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6296 Op.getOperand(2));
6297 case Intrinsic::aarch64_neon_umin:
6298 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6299 Op.getOperand(2));
6300 case Intrinsic::aarch64_neon_scalar_sqxtn:
6301 case Intrinsic::aarch64_neon_scalar_sqxtun:
6302 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6303 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6304 if (Op.getValueType() == MVT::i32)
6305 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6306 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6307 Op.getOperand(0),
6308 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6309 Op.getOperand(1))));
6310 return SDValue();
6311 }
6312 case Intrinsic::aarch64_neon_sqxtn:
6313 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6314 Op.getOperand(1));
6315 case Intrinsic::aarch64_neon_sqxtun:
6316 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6317 Op.getOperand(1));
6318 case Intrinsic::aarch64_neon_uqxtn:
6319 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6320 Op.getOperand(1));
6321 case Intrinsic::aarch64_neon_sqshrn:
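// A (rounding) saturating shift-right-narrow is lowered as an explicit vector
// shift right (VASHR/VLSHR, or SRSHR_I/URSHR_I for the rounding forms)
// followed by a saturating truncate (TRUNCATE_SSAT_S/_U, TRUNCATE_USAT_U),
// as in the cases below.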
6322 if (Op.getValueType().isVector())
6323 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6324 DAG.getNode(AArch64ISD::VASHR, DL,
6325 Op.getOperand(1).getValueType(),
6326 Op.getOperand(1), Op.getOperand(2)));
6327 return SDValue();
6328 case Intrinsic::aarch64_neon_sqshrun:
6329 if (Op.getValueType().isVector())
6330 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6331 DAG.getNode(AArch64ISD::VASHR, DL,
6332 Op.getOperand(1).getValueType(),
6333 Op.getOperand(1), Op.getOperand(2)));
6334 return SDValue();
6335 case Intrinsic::aarch64_neon_uqshrn:
6336 if (Op.getValueType().isVector())
6337 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6338 DAG.getNode(AArch64ISD::VLSHR, DL,
6339 Op.getOperand(1).getValueType(),
6340 Op.getOperand(1), Op.getOperand(2)));
6341 return SDValue();
6342 case Intrinsic::aarch64_neon_sqrshrn:
6343 if (Op.getValueType().isVector())
6344 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6345 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6346 Op.getOperand(1).getValueType(),
6347 Op.getOperand(1), Op.getOperand(2)));
6348 return SDValue();
6349 case Intrinsic::aarch64_neon_sqrshrun:
6350 if (Op.getValueType().isVector())
6351 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6352 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6353 Op.getOperand(1).getValueType(),
6354 Op.getOperand(1), Op.getOperand(2)));
6355 return SDValue();
6356 case Intrinsic::aarch64_neon_uqrshrn:
6357 if (Op.getValueType().isVector())
6358 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6359 DAG.getNode(AArch64ISD::URSHR_I, DL,
6360 Op.getOperand(1).getValueType(),
6361 Op.getOperand(1), Op.getOperand(2)));
6362 return SDValue();
6363 case Intrinsic::aarch64_neon_sqadd:
6364 if (Op.getValueType().isVector())
6365 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6366 Op.getOperand(2));
6367 return SDValue();
6368 case Intrinsic::aarch64_neon_sqsub:
6369 if (Op.getValueType().isVector())
6370 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6371 Op.getOperand(2));
6372 return SDValue();
6373 case Intrinsic::aarch64_neon_uqadd:
6374 if (Op.getValueType().isVector())
6375 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6376 Op.getOperand(2));
6377 return SDValue();
6378 case Intrinsic::aarch64_neon_uqsub:
6379 if (Op.getValueType().isVector())
6380 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6381 Op.getOperand(2));
6382 return SDValue();
6383 case Intrinsic::aarch64_sve_whilelt:
6384 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6385 /*IsEqual=*/false);
6386 case Intrinsic::aarch64_sve_whilels:
6387 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6388 /*IsEqual=*/true);
6389 case Intrinsic::aarch64_sve_whilele:
6390 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6391 /*IsEqual=*/true);
6392 case Intrinsic::aarch64_sve_sunpkhi:
6393 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6394 Op.getOperand(1));
6395 case Intrinsic::aarch64_sve_sunpklo:
6396 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6397 Op.getOperand(1));
6398 case Intrinsic::aarch64_sve_uunpkhi:
6399 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6400 Op.getOperand(1));
6401 case Intrinsic::aarch64_sve_uunpklo:
6402 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6403 Op.getOperand(1));
6404 case Intrinsic::aarch64_sve_clasta_n:
6405 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6406 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6407 case Intrinsic::aarch64_sve_clastb_n:
6408 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6409 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6410 case Intrinsic::aarch64_sve_lasta:
6411 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6412 Op.getOperand(1), Op.getOperand(2));
6413 case Intrinsic::aarch64_sve_lastb:
6414 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6415 Op.getOperand(1), Op.getOperand(2));
6416 case Intrinsic::aarch64_sve_rev:
6417 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6418 Op.getOperand(1));
6419 case Intrinsic::aarch64_sve_tbl:
6420 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6421 Op.getOperand(2));
6422 case Intrinsic::aarch64_sve_trn1:
6423 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6424 Op.getOperand(1), Op.getOperand(2));
6425 case Intrinsic::aarch64_sve_trn2:
6426 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6427 Op.getOperand(1), Op.getOperand(2));
6428 case Intrinsic::aarch64_sve_uzp1:
6429 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6430 Op.getOperand(1), Op.getOperand(2));
6431 case Intrinsic::aarch64_sve_uzp2:
6432 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6433 Op.getOperand(1), Op.getOperand(2));
6434 case Intrinsic::aarch64_sve_zip1:
6435 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6436 Op.getOperand(1), Op.getOperand(2));
6437 case Intrinsic::aarch64_sve_zip2:
6438 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6439 Op.getOperand(1), Op.getOperand(2));
6440 case Intrinsic::aarch64_sve_splice:
6441 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6442 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6443 case Intrinsic::aarch64_sve_ptrue:
6444 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6445 case Intrinsic::aarch64_sve_clz:
6446 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6447 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6448 case Intrinsic::aarch64_sme_cntsd: {
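// RDSVL #1 returns the streaming vector length in bytes; an exact shift
// right by 3 converts that to the number of 64-bit doublewords, which is
// what cntsd returns.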
6449 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6450 DAG.getConstant(1, DL, MVT::i32));
6451 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6452 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6453 }
6454 case Intrinsic::aarch64_sve_cnt: {
6455 SDValue Data = Op.getOperand(3);
6456 // CTPOP only supports integer operands.
6457 if (Data.getValueType().isFloatingPoint())
6458 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6459 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6460 Op.getOperand(2), Data, Op.getOperand(1));
6461 }
6462 case Intrinsic::aarch64_sve_dupq_lane:
6463 return LowerDUPQLane(Op, DAG);
6464 case Intrinsic::aarch64_sve_convert_from_svbool:
6465 if (Op.getValueType() == MVT::aarch64svcount)
6466 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6467 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6468 case Intrinsic::aarch64_sve_convert_to_svbool:
6469 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6470 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6471 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6472 case Intrinsic::aarch64_sve_fneg:
6473 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6474 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6475 case Intrinsic::aarch64_sve_frintp:
6476 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6478 case Intrinsic::aarch64_sve_frintm:
6479 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6480 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6481 case Intrinsic::aarch64_sve_frinti:
6482 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6483 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6484 Op.getOperand(1));
6485 case Intrinsic::aarch64_sve_frintx:
6486 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6487 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6488 case Intrinsic::aarch64_sve_frinta:
6489 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6490 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6491 case Intrinsic::aarch64_sve_frintn:
6492 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6493 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6494 Op.getOperand(1));
6495 case Intrinsic::aarch64_sve_frintz:
6496 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6497 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6498 case Intrinsic::aarch64_sve_ucvtf:
6499 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6500 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6501 Op.getOperand(1));
6502 case Intrinsic::aarch64_sve_scvtf:
6503 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6504 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6505 Op.getOperand(1));
6506 case Intrinsic::aarch64_sve_fcvtzu:
6507 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6508 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6509 case Intrinsic::aarch64_sve_fcvtzs:
6510 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6511 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6512 case Intrinsic::aarch64_sve_fsqrt:
6513 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6514 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6515 case Intrinsic::aarch64_sve_frecpx:
6516 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6517 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6518 case Intrinsic::aarch64_sve_frecpe_x:
6519 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6520 Op.getOperand(1));
6521 case Intrinsic::aarch64_sve_frecps_x:
6522 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6523 Op.getOperand(1), Op.getOperand(2));
6524 case Intrinsic::aarch64_sve_frsqrte_x:
6525 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6526 Op.getOperand(1));
6527 case Intrinsic::aarch64_sve_frsqrts_x:
6528 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6529 Op.getOperand(1), Op.getOperand(2));
6530 case Intrinsic::aarch64_sve_fabs:
6531 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6532 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6533 case Intrinsic::aarch64_sve_abs:
6534 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6535 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6536 case Intrinsic::aarch64_sve_neg:
6537 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6538 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_insr: {
6540 SDValue Scalar = Op.getOperand(2);
6541 EVT ScalarTy = Scalar.getValueType();
6542 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6543 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6544
6545 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6546 Op.getOperand(1), Scalar);
6547 }
6548 case Intrinsic::aarch64_sve_rbit:
6549 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6550 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6551 Op.getOperand(1));
6552 case Intrinsic::aarch64_sve_revb:
6553 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6554 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6555 case Intrinsic::aarch64_sve_revh:
6556 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6557 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6558 case Intrinsic::aarch64_sve_revw:
6559 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6560 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6561 case Intrinsic::aarch64_sve_revd:
6562 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6563 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6564 case Intrinsic::aarch64_sve_sxtb:
6565 return DAG.getNode(
6566 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6567 Op.getOperand(2), Op.getOperand(3),
6568 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6569 Op.getOperand(1));
6570 case Intrinsic::aarch64_sve_sxth:
6571 return DAG.getNode(
6572 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6573 Op.getOperand(2), Op.getOperand(3),
6574 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6575 Op.getOperand(1));
6576 case Intrinsic::aarch64_sve_sxtw:
6577 return DAG.getNode(
6578 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6579 Op.getOperand(2), Op.getOperand(3),
6580 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6581 Op.getOperand(1));
6582 case Intrinsic::aarch64_sve_uxtb:
6583 return DAG.getNode(
6584 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6585 Op.getOperand(2), Op.getOperand(3),
6586 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6587 Op.getOperand(1));
6588 case Intrinsic::aarch64_sve_uxth:
6589 return DAG.getNode(
6590 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6591 Op.getOperand(2), Op.getOperand(3),
6592 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6593 Op.getOperand(1));
6594 case Intrinsic::aarch64_sve_uxtw:
6595 return DAG.getNode(
6596 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6597 Op.getOperand(2), Op.getOperand(3),
6598 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6599 Op.getOperand(1));
6600 case Intrinsic::localaddress: {
6601 const auto &MF = DAG.getMachineFunction();
6602 const auto *RegInfo = Subtarget->getRegisterInfo();
6603 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6604 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6605 Op.getSimpleValueType());
6606 }
6607
6608 case Intrinsic::eh_recoverfp: {
6609 // FIXME: This needs to be implemented to correctly handle highly aligned
6610 // stack objects. For now we simply return the incoming FP. Refer to D53541
6611 // for more details.
6612 SDValue FnOp = Op.getOperand(1);
6613 SDValue IncomingFPOp = Op.getOperand(2);
6614 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6615 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6616 if (!Fn)
6617 report_fatal_error(
6618 "llvm.eh.recoverfp must take a function as the first argument");
6619 return IncomingFPOp;
6620 }
6621 case Intrinsic::aarch64_neon_vsri:
6622 case Intrinsic::aarch64_neon_vsli:
6623 case Intrinsic::aarch64_sve_sri:
6624 case Intrinsic::aarch64_sve_sli: {
6625 EVT Ty = Op.getValueType();
6626
6627 if (!Ty.isVector())
6628 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6629
6630 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6631
6632 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6633 IntNo == Intrinsic::aarch64_sve_sri;
6634 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6635 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6636 Op.getOperand(3));
6637 }
6638
6639 case Intrinsic::aarch64_neon_srhadd:
6640 case Intrinsic::aarch64_neon_urhadd:
6641 case Intrinsic::aarch64_neon_shadd:
6642 case Intrinsic::aarch64_neon_uhadd: {
6643 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6644 IntNo == Intrinsic::aarch64_neon_shadd);
6645 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6646 IntNo == Intrinsic::aarch64_neon_urhadd);
6647 unsigned Opcode = IsSignedAdd
6648 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6649 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6650 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6651 Op.getOperand(2));
6652 }
6653 case Intrinsic::aarch64_neon_saddlp:
6654 case Intrinsic::aarch64_neon_uaddlp: {
6655 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6656 ? AArch64ISD::UADDLP
6657 : AArch64ISD::SADDLP;
6658 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6659 }
6660 case Intrinsic::aarch64_neon_sdot:
6661 case Intrinsic::aarch64_neon_udot:
6662 case Intrinsic::aarch64_sve_sdot:
6663 case Intrinsic::aarch64_sve_udot: {
6664 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6665 IntNo == Intrinsic::aarch64_sve_udot)
6666 ? AArch64ISD::UDOT
6667 : AArch64ISD::SDOT;
6668 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6669 Op.getOperand(2), Op.getOperand(3));
6670 }
6671 case Intrinsic::aarch64_neon_usdot:
6672 case Intrinsic::aarch64_sve_usdot: {
6673 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6674 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6675 }
6676 case Intrinsic::aarch64_neon_saddlv:
6677 case Intrinsic::aarch64_neon_uaddlv: {
6678 EVT OpVT = Op.getOperand(1).getValueType();
6679 EVT ResVT = Op.getValueType();
6680 assert(
6681 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6682 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6683 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6684 "Unexpected aarch64_neon_u/saddlv type");
6685 (void)OpVT;
6686 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6687 SDValue ADDLV = DAG.getNode(
6688 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6689 : AArch64ISD::SADDLV,
6690 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6691 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6692 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6693 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6694 return EXTRACT_VEC_ELT;
6695 }
6696 case Intrinsic::experimental_cttz_elts: {
6697 SDValue CttzOp = Op.getOperand(1);
6698 EVT VT = CttzOp.getValueType();
6699 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6700
6701 if (VT.isFixedLengthVector()) {
6702 // We can use SVE instructions to lower this intrinsic by first creating
6703 // an SVE predicate register mask from the fixed-width vector.
6704 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6705 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6706 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6707 }
6708
6709 SDValue NewCttzElts =
6710 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6711 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6712 }
6713 case Intrinsic::experimental_vector_match: {
6714 return LowerVectorMatch(Op, DAG);
6715 }
6716 }
6717}
6718
6719bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6720 if (VT.getVectorElementType() == MVT::i8 ||
6721 VT.getVectorElementType() == MVT::i16) {
6722 EltTy = MVT::i32;
6723 return true;
6724 }
6725 return false;
6726}
6727
6728bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6729 EVT DataVT) const {
6730 const EVT IndexVT = Extend.getOperand(0).getValueType();
6731 // SVE only supports implicit extension of 32-bit indices.
6732 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6733 return false;
6734
6735 // Indices cannot be smaller than the main data type.
6736 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6737 return false;
6738
6739 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6740 // element container type, which would violate the previous clause.
6741 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6742}
6743
6744bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6745 EVT ExtVT = ExtVal.getValueType();
6746 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6747 return false;
6748
6749 // It may be worth creating extending masked loads if there are multiple
6750 // masked loads using the same predicate. That way we'll end up creating
6751 // extending masked loads that may then get split by the legaliser. This
6752 // results in just one set of predicate unpacks at the start, instead of
6753 // multiple sets of vector unpacks after each load.
6754 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6755 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6756 // Disable extending masked loads for fixed-width for now, since the code
6757 // quality doesn't look great.
6758 if (!ExtVT.isScalableVector())
6759 return false;
6760
6761 unsigned NumExtMaskedLoads = 0;
6762 for (auto *U : Ld->getMask()->users())
6763 if (isa<MaskedLoadSDNode>(U))
6764 NumExtMaskedLoads++;
6765
6766 if (NumExtMaskedLoads <= 1)
6767 return false;
6768 }
6769 }
6770
6771 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6772 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6773 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6774}
6775
6776unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6777 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6778 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6779 AArch64ISD::GLD1_MERGE_ZERO},
6780 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6781 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6782 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6783 AArch64ISD::GLD1_MERGE_ZERO},
6784 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6785 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6786 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6787 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6788 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6789 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6790 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6791 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6792 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6793 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6794 };
6795 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6796 return AddrModes.find(Key)->second;
6797}
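// Illustrative note (added in this edited listing, not part of the LLVM
// source): the table above only distinguishes signed from unsigned indices
// when an extension is actually needed, e.g.
//   getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/true, /*NeedsExtend=*/true)
//     -> AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO
//   getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/true, /*NeedsExtend=*/false)
//     -> AArch64ISD::GLD1_SCALED_MERGE_ZERO (same as the unsigned lookup)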
6798
6799unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6800 switch (Opcode) {
6801 default:
6802 llvm_unreachable("unimplemented opcode");
6803 return Opcode;
6804 case AArch64ISD::GLD1_MERGE_ZERO:
6805 return AArch64ISD::GLD1S_MERGE_ZERO;
6806 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6807 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6808 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6809 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6810 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6811 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6812 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6813 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6814 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6815 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6816 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6817 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6818 }
6819}
6820
6821SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6822 SelectionDAG &DAG) const {
6823 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6824
6825 SDLoc DL(Op);
6826 SDValue Chain = MGT->getChain();
6827 SDValue PassThru = MGT->getPassThru();
6828 SDValue Mask = MGT->getMask();
6829 SDValue BasePtr = MGT->getBasePtr();
6830 SDValue Index = MGT->getIndex();
6831 SDValue Scale = MGT->getScale();
6832 EVT VT = Op.getValueType();
6833 EVT MemVT = MGT->getMemoryVT();
6834 ISD::LoadExtType ExtType = MGT->getExtensionType();
6835 ISD::MemIndexType IndexType = MGT->getIndexType();
6836
6837 // SVE supports zero (and so undef) passthrough values only, everything else
6838 // must be handled manually by an explicit select on the load's output.
6839 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6840 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6841 SDValue Load =
6842 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6843 MGT->getMemOperand(), IndexType, ExtType);
6844 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6845 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6846 }
6847
6848 bool IsScaled = MGT->isIndexScaled();
6849 bool IsSigned = MGT->isIndexSigned();
6850
6851 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6852 // must be calculated beforehand.
6853 uint64_t ScaleVal = Scale->getAsZExtVal();
6854 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6855 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6856 EVT IndexVT = Index.getValueType();
6857 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6858 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6859 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6860
6861 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6862 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6863 MGT->getMemOperand(), IndexType, ExtType);
6864 }
6865
6866 // Lower fixed length gather to a scalable equivalent.
6867 if (VT.isFixedLengthVector()) {
6868 assert(Subtarget->useSVEForFixedLengthVectors() &&
6869 "Cannot lower when not using SVE for fixed vectors!");
6870
6871 // NOTE: Handle floating-point as if integer then bitcast the result.
6872 EVT DataVT = VT.changeVectorElementTypeToInteger();
6873 MemVT = MemVT.changeVectorElementTypeToInteger();
6874
6875 // Find the smallest integer fixed length vector we can use for the gather.
6876 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6877 if (DataVT.getVectorElementType() == MVT::i64 ||
6878 Index.getValueType().getVectorElementType() == MVT::i64 ||
6879 Mask.getValueType().getVectorElementType() == MVT::i64)
6880 PromotedVT = VT.changeVectorElementType(MVT::i64);
6881
6882 // Promote vector operands except for passthrough, which we know is either
6883 // undef or zero, and thus best constructed directly.
6884 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6885 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6886 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6887
6888 // A promoted result type forces the need for an extending load.
6889 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6890 ExtType = ISD::EXTLOAD;
6891
6892 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6893
6894 // Convert fixed length vector operands to scalable.
6895 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6896 Index = convertToScalableVector(DAG, ContainerVT, Index);
6897 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6898 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6899 : DAG.getConstant(0, DL, ContainerVT);
6900
6901 // Emit equivalent scalable vector gather.
6902 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6903 SDValue Load =
6904 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6905 Ops, MGT->getMemOperand(), IndexType, ExtType);
6906
6907 // Extract fixed length data then convert to the required result type.
6908 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6909 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6910 if (VT.isFloatingPoint())
6911 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6912
6913 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6914 }
6915
6916 // Everything else is legal.
6917 return Op;
6918}
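// Worked example (added in this edited listing, not part of the LLVM source)
// of the scale normalisation in LowerMGATHER above: if MemVT has i32 elements
// (scalar store size 4) but the gather node carries Scale == 8, the index
// vector is rewritten as Index << Log2_32(8), i.e. Index * 8, and Scale is
// reset to 1, leaving a byte-offset form that maps directly onto SVE gather
// addressing.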
6919
6920SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6921 SelectionDAG &DAG) const {
6922 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6923
6924 SDLoc DL(Op);
6925 SDValue Chain = MSC->getChain();
6926 SDValue StoreVal = MSC->getValue();
6927 SDValue Mask = MSC->getMask();
6928 SDValue BasePtr = MSC->getBasePtr();
6929 SDValue Index = MSC->getIndex();
6930 SDValue Scale = MSC->getScale();
6931 EVT VT = StoreVal.getValueType();
6932 EVT MemVT = MSC->getMemoryVT();
6933 ISD::MemIndexType IndexType = MSC->getIndexType();
6934 bool Truncating = MSC->isTruncatingStore();
6935
6936 bool IsScaled = MSC->isIndexScaled();
6937 bool IsSigned = MSC->isIndexSigned();
6938
6939 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6940 // must be calculated beforehand.
6941 uint64_t ScaleVal = Scale->getAsZExtVal();
6942 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6943 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6944 EVT IndexVT = Index.getValueType();
6945 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6946 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6947 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6948
6949 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6950 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6951 MSC->getMemOperand(), IndexType, Truncating);
6952 }
6953
6954 // Lower fixed length scatter to a scalable equivalent.
6955 if (VT.isFixedLengthVector()) {
6956 assert(Subtarget->useSVEForFixedLengthVectors() &&
6957 "Cannot lower when not using SVE for fixed vectors!");
6958
6959 // Once bitcast we treat floating-point scatters as if integer.
6960 if (VT.isFloatingPoint()) {
6961 VT = VT.changeVectorElementTypeToInteger();
6962 MemVT = MemVT.changeVectorElementTypeToInteger();
6963 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6964 }
6965
6966 // Find the smallest integer fixed length vector we can use for the scatter.
6967 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6968 if (VT.getVectorElementType() == MVT::i64 ||
6969 Index.getValueType().getVectorElementType() == MVT::i64 ||
6970 Mask.getValueType().getVectorElementType() == MVT::i64)
6971 PromotedVT = VT.changeVectorElementType(MVT::i64);
6972
6973 // Promote vector operands.
6974 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6975 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6976 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6977 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6978
6979 // A promoted value type forces the need for a truncating store.
6980 if (PromotedVT != VT)
6981 Truncating = true;
6982
6983 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6984
6985 // Convert fixed length vector operands to scalable.
6986 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6987 Index = convertToScalableVector(DAG, ContainerVT, Index);
6988 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6989 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6990
6991 // Emit equivalent scalable vector scatter.
6992 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6993 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6994 MSC->getMemOperand(), IndexType, Truncating);
6995 }
6996
6997 // Everything else is legal.
6998 return Op;
6999}
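// Illustrative note (added in this edited listing, not part of the LLVM
// source): for a fixed-length scatter of <4 x i8> data the promotion above
// any-extends the stored value to <4 x i32>, marks the scatter as truncating,
// and moves all operands into an SVE container such as <vscale x 4 x i32>,
// while MemVT keeps its i8 elements so only bytes are written to memory.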
7000
7001SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7002 SDLoc DL(Op);
7003 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7004 assert(LoadNode && "Expected custom lowering of a masked load node");
7005 EVT VT = Op->getValueType(0);
7006
7007 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7008 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7009
7010 SDValue PassThru = LoadNode->getPassThru();
7011 SDValue Mask = LoadNode->getMask();
7012
7013 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7014 return Op;
7015
7016 SDValue Load = DAG.getMaskedLoad(
7017 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7018 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7019 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7020 LoadNode->getExtensionType());
7021
7022 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7023
7024 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7025}
7026
7027// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7028 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7029 EVT VT, EVT MemVT,
7030 SelectionDAG &DAG) {
7031 assert(VT.isVector() && "VT should be a vector type");
7032 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7033
7034 SDValue Value = ST->getValue();
7035
7036 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
7037 // the word lane which represents the v4i8 subvector. It optimizes the store
7038 // to:
7039 //
7040 // xtn v0.8b, v0.8h
7041 // str s0, [x0]
7042
7043 SDValue Undef = DAG.getUNDEF(MVT::i16);
7044 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7045 {Undef, Undef, Undef, Undef});
7046
7047 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7048 Value, UndefVec);
7049 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7050
7051 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7052 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7053 Trunc, DAG.getConstant(0, DL, MVT::i64));
7054
7055 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7056 ST->getBasePtr(), ST->getMemOperand());
7057}
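// Illustrative DAG sequence (added in this edited listing, not part of the
// LLVM source) for the truncating-store lowering above, given a v4i16 value
// stored to v4i8 memory:
//   v8i16 = concat_vectors v4i16:Value, v4i16:undef
//   v8i8  = truncate v8i16
//   v2i32 = bitcast v8i8
//   i32   = extract_vector_elt v2i32, 0
//   store i32 -> [ptr]        // materialises as the "xtn + str s0" noted above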
7058
7059 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7060 SDLoc DL(Op);
7061 SDValue Src = Op.getOperand(0);
7062 MVT DestVT = Op.getSimpleValueType();
7063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7064 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op);
7065
7066 unsigned SrcAS = N->getSrcAddressSpace();
7067 unsigned DestAS = N->getDestAddressSpace();
7068 assert(SrcAS != DestAS &&
7069 "addrspacecast must be between different address spaces");
7070 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7071 TLI.getTargetMachine().getPointerSize(DestAS) &&
7072 "addrspacecast must be between different ptr sizes");
7073 (void)TLI;
7074
7075 if (SrcAS == ARM64AS::PTR32_SPTR) {
7076 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7077 DAG.getTargetConstant(0, DL, DestVT));
7078 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7079 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7080 DAG.getTargetConstant(0, DL, DestVT));
7081 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7082 (DestAS == ARM64AS::PTR32_UPTR)) {
7083 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7084 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7085 return Trunc;
7086 } else {
7087 return Src;
7088 }
7089}
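// Illustrative note (added in this edited listing, not part of the LLVM
// source): casting a 32-bit signed pointer (ARM64AS::PTR32_SPTR) up to a
// 64-bit address space sign-extends the value, PTR32_UPTR zero-extends it,
// and casting a 64-bit pointer down to either 32-bit address space keeps only
// the low 32 bits via an any-extend/truncate followed by a
// zero-extend-in-register.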
7090
7091 // Custom lowering for any store, vector or scalar, with or without a
7092 // truncate operation. Currently we only custom lower truncating stores
7093 // from vector v4i16 to v4i8 and volatile stores of i128.
7094SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7095 SelectionDAG &DAG) const {
7096 SDLoc Dl(Op);
7097 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7098 assert (StoreNode && "Can only custom lower store nodes");
7099
7100 SDValue Value = StoreNode->getValue();
7101
7102 EVT VT = Value.getValueType();
7103 EVT MemVT = StoreNode->getMemoryVT();
7104
7105 if (VT.isVector()) {
7106 if (useSVEForFixedLengthVectorVT(
7107 VT,
7108 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7109 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7110
7111 unsigned AS = StoreNode->getAddressSpace();
7112 Align Alignment = StoreNode->getAlign();
7113 if (Alignment < MemVT.getStoreSize() &&
7114 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7115 StoreNode->getMemOperand()->getFlags(),
7116 nullptr)) {
7117 return scalarizeVectorStore(StoreNode, DAG);
7118 }
7119
7120 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7121 MemVT == MVT::v4i8) {
7122 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7123 }
7124 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7125 // the custom lowering, as there are no un-paired non-temporal stores and
7126 // legalization will break up 256 bit inputs.
7127 ElementCount EC = MemVT.getVectorElementCount();
7128 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7129 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7130 (MemVT.getScalarSizeInBits() == 8u ||
7131 MemVT.getScalarSizeInBits() == 16u ||
7132 MemVT.getScalarSizeInBits() == 32u ||
7133 MemVT.getScalarSizeInBits() == 64u)) {
7134 SDValue Lo =
7135 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7136 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7137 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7138 SDValue Hi =
7139 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7140 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7141 StoreNode->getValue(),
7142 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7143 SDValue Result = DAG.getMemIntrinsicNode(
7144 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7145 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7146 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7147 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7148 return Result;
7149 }
7150 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7151 return LowerStore128(Op, DAG);
7152 } else if (MemVT == MVT::i64x8) {
7153 SDValue Value = StoreNode->getValue();
7154 assert(Value->getValueType(0) == MVT::i64x8);
7155 SDValue Chain = StoreNode->getChain();
7156 SDValue Base = StoreNode->getBasePtr();
7157 EVT PtrVT = Base.getValueType();
7158 for (unsigned i = 0; i < 8; i++) {
7159 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7160 Value, DAG.getConstant(i, Dl, MVT::i32));
7161 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7162 DAG.getConstant(i * 8, Dl, PtrVT));
7163 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7164 StoreNode->getBaseAlign());
7165 }
7166 return Chain;
7167 }
7168
7169 return SDValue();
7170}
7171
7172/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7173SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7174 SelectionDAG &DAG) const {
7175 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7176 assert(StoreNode->getMemoryVT() == MVT::i128);
7177 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7178
7179 bool IsStoreRelease =
7180 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7181 if (StoreNode->isAtomic())
7182 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7183 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7184 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7185 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7186
7187 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7188 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7189 ? StoreNode->getOperand(1)
7190 : StoreNode->getOperand(2);
7191 SDLoc DL(Op);
7192 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7193 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7194 if (DAG.getDataLayout().isBigEndian())
7195 std::swap(StoreValue.first, StoreValue.second);
7196 SDValue Result = DAG.getMemIntrinsicNode(
7197 Opcode, DL, DAG.getVTList(MVT::Other),
7198 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7199 StoreNode->getBasePtr()},
7200 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7201 return Result;
7202}
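// Illustrative note (added in this edited listing, not part of the LLVM
// source): a volatile or atomic i128 store is split above into two i64 halves
// with DAG.SplitScalar (swapped on big-endian targets) and emitted as a single
// STP, or as STILP when the store has release ordering on a target with the
// LSE2 and RCPC3 features.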
7203
7204SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7205 SelectionDAG &DAG) const {
7206 SDLoc DL(Op);
7207 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7208 assert(LoadNode && "Expected custom lowering of a load node");
7209
7210 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7211 SmallVector<SDValue, 8> Ops;
7212 SDValue Base = LoadNode->getBasePtr();
7213 SDValue Chain = LoadNode->getChain();
7214 EVT PtrVT = Base.getValueType();
7215 for (unsigned i = 0; i < 8; i++) {
7216 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7217 DAG.getConstant(i * 8, DL, PtrVT));
7218 SDValue Part =
7219 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7220 LoadNode->getBaseAlign());
7221 Ops.push_back(Part);
7222 Chain = SDValue(Part.getNode(), 1);
7223 }
7224 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7225 return DAG.getMergeValues({Loaded, Chain}, DL);
7226 }
7227
7228 // Custom lowering for extending v4i8 vector loads.
7229 EVT VT = Op->getValueType(0);
7230 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7231
7232 if (LoadNode->getMemoryVT() != MVT::v4i8)
7233 return SDValue();
7234
7235 // Avoid generating unaligned loads.
7236 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7237 return SDValue();
7238
7239 unsigned ExtType;
7240 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7241 ExtType = ISD::SIGN_EXTEND;
7242 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7243 LoadNode->getExtensionType() == ISD::EXTLOAD)
7244 ExtType = ISD::ZERO_EXTEND;
7245 else
7246 return SDValue();
7247
7248 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7249 LoadNode->getBasePtr(), MachinePointerInfo());
7250 SDValue Chain = Load.getValue(1);
7251 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7252 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7253 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7254 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7255 DAG.getConstant(0, DL, MVT::i64));
7256 if (VT == MVT::v4i32)
7257 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7258 return DAG.getMergeValues({Ext, Chain}, DL);
7259}
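// Illustrative note (added in this edited listing, not part of the LLVM
// source): an extending load from v4i8 memory is rebuilt above as a plain
// 32-bit scalar load that is then widened in registers:
//   f32   = load [ptr]
//   v2f32 = scalar_to_vector f32
//   v8i8  = bitcast v2f32
//   v8i16 = sign_extend/zero_extend v8i8
//   v4i16 = extract_subvector v8i16, 0   // extended once more for v4i32 results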
7260
7261SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7262 SelectionDAG &DAG) const {
7263 SDLoc DL(Op);
7264 SDValue Vec = Op.getOperand(0);
7265 SDValue Mask = Op.getOperand(1);
7266 SDValue Passthru = Op.getOperand(2);
7267 EVT VecVT = Vec.getValueType();
7268 EVT MaskVT = Mask.getValueType();
7269 EVT ElmtVT = VecVT.getVectorElementType();
7270 const bool IsFixedLength = VecVT.isFixedLengthVector();
7271 const bool HasPassthru = !Passthru.isUndef();
7272 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7273 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7274
7275 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7276
7277 if (!Subtarget->isSVEAvailable())
7278 return SDValue();
7279
7280 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7281 return SDValue();
7282
7283 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7284 if (MinElmts != 2 && MinElmts != 4)
7285 return SDValue();
7286
7287 // We can use the SVE register containing the NEON vector in its lowest bits.
7288 if (IsFixedLength) {
7289 EVT ScalableVecVT =
7290 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7291 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7292 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7293
7294 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7295 DAG.getUNDEF(ScalableVecVT), Vec,
7296 DAG.getConstant(0, DL, MVT::i64));
7297 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7298 DAG.getUNDEF(ScalableMaskVT), Mask,
7299 DAG.getConstant(0, DL, MVT::i64));
7300 Mask = DAG.getNode(ISD::TRUNCATE, DL,
7301 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7302 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7303 DAG.getUNDEF(ScalableVecVT), Passthru,
7304 DAG.getConstant(0, DL, MVT::i64));
7305
7306 VecVT = Vec.getValueType();
7307 MaskVT = Mask.getValueType();
7308 }
7309
7310 // Get legal type for compact instruction
7311 EVT ContainerVT = getSVEContainerType(VecVT);
7312 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7313
7314 // Convert to i32 or i64 for smaller types, as these are the only supported
7315 // sizes for compact.
7316 if (ContainerVT != VecVT) {
7317 Vec = DAG.getBitcast(CastVT, Vec);
7318 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7319 }
7320
7321 SDValue Compressed = DAG.getNode(
7322 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7323 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7324
7325 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7326 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7327 SDValue Offset = DAG.getNode(
7328 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7329 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7330
7331 SDValue IndexMask = DAG.getNode(
7332 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7333 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7334 DAG.getConstant(0, DL, MVT::i64), Offset);
7335
7336 Compressed =
7337 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7338 }
7339
7340 // Extracting from a legal SVE type before truncating produces better code.
7341 if (IsFixedLength) {
7342 Compressed = DAG.getNode(
7343 ISD::EXTRACT_SUBVECTOR, DL,
7344 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7345 Compressed, DAG.getConstant(0, DL, MVT::i64));
7346 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7347 VecVT = FixedVecVT;
7348 }
7349
7350 // If we changed the element type before, we need to convert it back.
7351 if (ContainerVT != VecVT) {
7352 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7353 Compressed = DAG.getBitcast(VecVT, Compressed);
7354 }
7355
7356 return Compressed;
7357}
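// Illustrative note (added in this edited listing, not part of the LLVM
// source): for a fixed-length v4i32 VECTOR_COMPRESS with a non-zero passthru,
// the code above (1) widens the vector, mask and passthru into
// <vscale x 4 x i32> / <vscale x 4 x i1> registers, (2) applies the
// aarch64.sve.compact intrinsic, (3) uses cntp + whilelo to keep the first
// "number of active lanes" results and select the passthru into the remaining
// lanes, and (4) extracts the fixed-length result back out of the SVE register.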
7358
7359// Generate SUBS and CSEL for integer abs.
7360SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7361 MVT VT = Op.getSimpleValueType();
7362
7363 if (VT.isVector())
7364 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7365
7366 SDLoc DL(Op);
7367 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7368
7369 // Generate SUBS & CSEL.
7370 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7371 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7372 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7373 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7374}
7375
7376 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7377 SDValue Chain = Op.getOperand(0);
7378 SDValue Cond = Op.getOperand(1);
7379 SDValue Dest = Op.getOperand(2);
7380
7381 AArch64CC::CondCode CC;
7382 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7383 SDLoc DL(Op);
7384 SDValue CCVal = getCondCode(DAG, CC);
7385 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7386 Cmp);
7387 }
7388
7389 return SDValue();
7390}
7391
7392// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
7393// FSHL is converted to FSHR before deciding what to do with it
7394 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7395 SDValue Shifts = Op.getOperand(2);
7396 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7397 // If opcode is FSHL, convert it to FSHR
7398 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7399 SDLoc DL(Op);
7400 MVT VT = Op.getSimpleValueType();
7401 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7402
7403 if (Op.getOpcode() == ISD::FSHL) {
7404 if (NewShiftNo == 0)
7405 return Op.getOperand(0);
7406
7407 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7408 return DAG.getNode(
7409 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7410 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7411 }
7412
7413 if (Op.getOpcode() == ISD::FSHR) {
7414 if (NewShiftNo == 0)
7415 return Op.getOperand(1);
7416
7417 if (ShiftNo->getZExtValue() == NewShiftNo)
7418 return Op;
7419
7420 // Rewrite using the normalised shift amount.
7421 return DAG.getNode(
7422 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7423 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7424 }
7425 }
7426
7427 return SDValue();
7428}
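// Worked example (added in this edited listing, not part of the LLVM source)
// of the FSHL -> FSHR normalisation above for an i64 funnel shift:
//   fshl(a, b, 3) == fshr(a, b, 64 - (3 % 64)) == fshr(a, b, 61)
// while a constant shift amount that is a multiple of the bit width simply
// returns the first operand (FSHL) or the second operand (FSHR) unchanged.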
7429
7430 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7431 SDValue X = Op.getOperand(0);
7432 EVT XScalarTy = X.getValueType();
7433 SDValue Exp = Op.getOperand(1);
7434
7435 SDLoc DL(Op);
7436 EVT XVT, ExpVT;
7437 switch (Op.getSimpleValueType().SimpleTy) {
7438 default:
7439 return SDValue();
7440 case MVT::bf16:
7441 case MVT::f16:
7442 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7443 [[fallthrough]];
7444 case MVT::f32:
7445 XVT = MVT::nxv4f32;
7446 ExpVT = MVT::nxv4i32;
7447 break;
7448 case MVT::f64:
7449 XVT = MVT::nxv2f64;
7450 ExpVT = MVT::nxv2i64;
7451 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7452 break;
7453 }
7454
7455 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7456 SDValue VX =
7457 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7458 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7459 DAG.getUNDEF(ExpVT), Exp, Zero);
7460 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7461 AArch64SVEPredPattern::all);
7462 SDValue FScale =
7463 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7464 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7465 VPg, VX, VExp);
7466 SDValue Final =
7467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7468 if (X.getValueType() != XScalarTy)
7469 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7470 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7471 return Final;
7472}
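// Illustrative note (added in this edited listing, not part of the LLVM
// source): a scalar ldexp such as f64 fldexp(x, n) is lowered above by placing
// x and n into lane 0 of nxv2f64 / nxv2i64 vectors, applying the SVE FSCALE
// intrinsic under an all-true predicate, and extracting lane 0 of the result;
// f16/bf16 inputs take a detour through f32 and finish with an FP_ROUND.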
7473
7474SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7475 SelectionDAG &DAG) const {
7476 return Op.getOperand(0);
7477}
7478
7479SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7480 SelectionDAG &DAG) const {
7481 SDValue Chain = Op.getOperand(0);
7482 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7483 SDValue FPtr = Op.getOperand(2); // nested function
7484 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7485
7486 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7487
7488 // ldr NestReg, .+16
7489 // ldr x17, .+20
7490 // br x17
7491 // .word 0
7492 // .nest: .qword nest
7493 // .fptr: .qword fptr
7494 SDValue OutChains[5];
7495
7496 const Function *Func =
7497 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7498 CallingConv::ID CC = Func->getCallingConv();
7499 unsigned NestReg;
7500
7501 switch (CC) {
7502 default:
7503 NestReg = 0x0f; // X15
7504 break;
7506 // Must be kept in sync with AArch64CallingConv.td
7507 NestReg = 0x04; // X4
7508 break;
7509 }
7510
7511 const char FptrReg = 0x11; // X17
7512
7513 SDValue Addr = Trmp;
7514
7515 SDLoc DL(Op);
7516 OutChains[0] = DAG.getStore(
7517 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7518 MachinePointerInfo(TrmpAddr));
7519
7520 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7521 DAG.getConstant(4, DL, MVT::i64));
7522 OutChains[1] = DAG.getStore(
7523 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7524 MachinePointerInfo(TrmpAddr, 4));
7525
7526 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7527 DAG.getConstant(8, DL, MVT::i64));
7528 OutChains[2] =
7529 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7530 MachinePointerInfo(TrmpAddr, 8));
7531
7532 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7533 DAG.getConstant(16, DL, MVT::i64));
7534 OutChains[3] =
7535 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7536
7537 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7538 DAG.getConstant(24, DL, MVT::i64));
7539 OutChains[4] =
7540 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7541
7542 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7543
7544 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7545 DAG.getConstant(12, DL, MVT::i64));
7546
7547 // Call clear cache on the trampoline instructions.
7548 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7549 EndOfTrmp);
7550}
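// Illustrative decoding (added in this edited listing; inferred from the
// constants above rather than stated in the original source):
// 0x58000080 | NestReg is an LDR-literal "ldr x<NestReg>, .+16" (imm19 = 4),
// 0x580000b0 | FptrReg resolves to "ldr x17, .+20" (imm19 = 5), and
// 0xd61f0220 is "br x17", matching the trampoline layout sketched before the
// stores.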
7551
7552 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7553 SelectionDAG &DAG) const {
7554 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7555 LLVM_DEBUG(Op.dump());
7556
7557 switch (Op.getOpcode()) {
7558 default:
7559 llvm_unreachable("unimplemented operand");
7560 return SDValue();
7561 case ISD::LOOP_DEPENDENCE_RAW_MASK:
7562 case ISD::LOOP_DEPENDENCE_WAR_MASK:
7563 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7564 case ISD::BITCAST:
7565 return LowerBITCAST(Op, DAG);
7566 case ISD::GlobalAddress:
7567 return LowerGlobalAddress(Op, DAG);
7568 case ISD::GlobalTLSAddress:
7569 return LowerGlobalTLSAddress(Op, DAG);
7570 case ISD::PtrAuthGlobalAddress:
7571 return LowerPtrAuthGlobalAddress(Op, DAG);
7572 case ISD::ADJUST_TRAMPOLINE:
7573 return LowerADJUST_TRAMPOLINE(Op, DAG);
7574 case ISD::INIT_TRAMPOLINE:
7575 return LowerINIT_TRAMPOLINE(Op, DAG);
7576 case ISD::SETCC:
7577 case ISD::STRICT_FSETCC:
7578 case ISD::STRICT_FSETCCS:
7579 return LowerSETCC(Op, DAG);
7580 case ISD::SETCCCARRY:
7581 return LowerSETCCCARRY(Op, DAG);
7582 case ISD::BRCOND:
7583 return LowerBRCOND(Op, DAG);
7584 case ISD::BR_CC:
7585 return LowerBR_CC(Op, DAG);
7586 case ISD::SELECT:
7587 return LowerSELECT(Op, DAG);
7588 case ISD::SELECT_CC:
7589 return LowerSELECT_CC(Op, DAG);
7590 case ISD::JumpTable:
7591 return LowerJumpTable(Op, DAG);
7592 case ISD::BR_JT:
7593 return LowerBR_JT(Op, DAG);
7594 case ISD::BRIND:
7595 return LowerBRIND(Op, DAG);
7596 case ISD::ConstantPool:
7597 return LowerConstantPool(Op, DAG);
7598 case ISD::BlockAddress:
7599 return LowerBlockAddress(Op, DAG);
7600 case ISD::VASTART:
7601 return LowerVASTART(Op, DAG);
7602 case ISD::VACOPY:
7603 return LowerVACOPY(Op, DAG);
7604 case ISD::VAARG:
7605 return LowerVAARG(Op, DAG);
7606 case ISD::UADDO_CARRY:
7607 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7608 case ISD::USUBO_CARRY:
7609 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7610 case ISD::SADDO_CARRY:
7611 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7612 case ISD::SSUBO_CARRY:
7613 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7614 case ISD::SADDO:
7615 case ISD::UADDO:
7616 case ISD::SSUBO:
7617 case ISD::USUBO:
7618 case ISD::SMULO:
7619 case ISD::UMULO:
7620 return LowerXALUO(Op, DAG);
7621 case ISD::FADD:
7622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7623 case ISD::FSUB:
7624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7625 case ISD::FMUL:
7626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7627 case ISD::FMA:
7628 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7629 case ISD::FDIV:
7630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7631 case ISD::FNEG:
7632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7633 case ISD::FCEIL:
7634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7635 case ISD::FFLOOR:
7636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7637 case ISD::FNEARBYINT:
7638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7639 case ISD::FRINT:
7640 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7641 case ISD::FROUND:
7642 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7643 case ISD::FROUNDEVEN:
7644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7645 case ISD::FTRUNC:
7646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7647 case ISD::FSQRT:
7648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7649 case ISD::FABS:
7650 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7651 case ISD::FP_ROUND:
7652 case ISD::STRICT_FP_ROUND:
7653 return LowerFP_ROUND(Op, DAG);
7654 case ISD::FP_EXTEND:
7655 case ISD::STRICT_FP_EXTEND:
7656 return LowerFP_EXTEND(Op, DAG);
7657 case ISD::FRAMEADDR:
7658 return LowerFRAMEADDR(Op, DAG);
7659 case ISD::SPONENTRY:
7660 return LowerSPONENTRY(Op, DAG);
7661 case ISD::RETURNADDR:
7662 return LowerRETURNADDR(Op, DAG);
7663 case ISD::ADDROFRETURNADDR:
7664 return LowerADDROFRETURNADDR(Op, DAG);
7665 case ISD::CONCAT_VECTORS:
7666 return LowerCONCAT_VECTORS(Op, DAG);
7667 case ISD::INSERT_VECTOR_ELT:
7668 return LowerINSERT_VECTOR_ELT(Op, DAG);
7669 case ISD::EXTRACT_VECTOR_ELT:
7670 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7671 case ISD::BUILD_VECTOR:
7672 return LowerBUILD_VECTOR(Op, DAG);
7673 case ISD::ZERO_EXTEND_VECTOR_INREG:
7674 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7675 case ISD::VECTOR_SHUFFLE:
7676 return LowerVECTOR_SHUFFLE(Op, DAG);
7677 case ISD::SPLAT_VECTOR:
7678 return LowerSPLAT_VECTOR(Op, DAG);
7679 case ISD::EXTRACT_SUBVECTOR:
7680 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7681 case ISD::INSERT_SUBVECTOR:
7682 return LowerINSERT_SUBVECTOR(Op, DAG);
7683 case ISD::SDIV:
7684 case ISD::UDIV:
7685 return LowerDIV(Op, DAG);
7686 case ISD::SMIN:
7687 case ISD::UMIN:
7688 case ISD::SMAX:
7689 case ISD::UMAX:
7690 return LowerMinMax(Op, DAG);
7691 case ISD::SRA:
7692 case ISD::SRL:
7693 case ISD::SHL:
7694 return LowerVectorSRA_SRL_SHL(Op, DAG);
7695 case ISD::SHL_PARTS:
7696 case ISD::SRL_PARTS:
7697 case ISD::SRA_PARTS:
7698 return LowerShiftParts(Op, DAG);
7699 case ISD::CTPOP:
7700 case ISD::PARITY:
7701 return LowerCTPOP_PARITY(Op, DAG);
7702 case ISD::FCOPYSIGN:
7703 return LowerFCOPYSIGN(Op, DAG);
7704 case ISD::OR:
7705 return LowerVectorOR(Op, DAG);
7706 case ISD::XOR:
7707 return LowerXOR(Op, DAG);
7708 case ISD::PREFETCH:
7709 return LowerPREFETCH(Op, DAG);
7710 case ISD::SINT_TO_FP:
7711 case ISD::UINT_TO_FP:
7712 case ISD::STRICT_SINT_TO_FP:
7713 case ISD::STRICT_UINT_TO_FP:
7714 return LowerINT_TO_FP(Op, DAG);
7715 case ISD::FP_TO_SINT:
7716 case ISD::FP_TO_UINT:
7717 case ISD::STRICT_FP_TO_SINT:
7718 case ISD::STRICT_FP_TO_UINT:
7719 return LowerFP_TO_INT(Op, DAG);
7720 case ISD::FP_TO_SINT_SAT:
7721 case ISD::FP_TO_UINT_SAT:
7722 return LowerFP_TO_INT_SAT(Op, DAG);
7723 case ISD::FSINCOS:
7724 return LowerFSINCOS(Op, DAG);
7725 case ISD::GET_ROUNDING:
7726 return LowerGET_ROUNDING(Op, DAG);
7727 case ISD::SET_ROUNDING:
7728 return LowerSET_ROUNDING(Op, DAG);
7729 case ISD::GET_FPMODE:
7730 return LowerGET_FPMODE(Op, DAG);
7731 case ISD::SET_FPMODE:
7732 return LowerSET_FPMODE(Op, DAG);
7733 case ISD::RESET_FPMODE:
7734 return LowerRESET_FPMODE(Op, DAG);
7735 case ISD::MUL:
7736 return LowerMUL(Op, DAG);
7737 case ISD::MULHS:
7738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7739 case ISD::MULHU:
7740 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7741 case ISD::INTRINSIC_W_CHAIN:
7742 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7743 case ISD::INTRINSIC_WO_CHAIN:
7744 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7745 case ISD::INTRINSIC_VOID:
7746 return LowerINTRINSIC_VOID(Op, DAG);
7747 case ISD::ATOMIC_STORE:
7748 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7749 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7750 return LowerStore128(Op, DAG);
7751 }
7752 return SDValue();
7753 case ISD::STORE:
7754 return LowerSTORE(Op, DAG);
7755 case ISD::MSTORE:
7756 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7757 case ISD::MGATHER:
7758 return LowerMGATHER(Op, DAG);
7759 case ISD::MSCATTER:
7760 return LowerMSCATTER(Op, DAG);
7761 case ISD::VECREDUCE_SEQ_FADD:
7762 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7763 case ISD::VECREDUCE_ADD:
7764 case ISD::VECREDUCE_AND:
7765 case ISD::VECREDUCE_OR:
7766 case ISD::VECREDUCE_XOR:
7767 case ISD::VECREDUCE_SMAX:
7768 case ISD::VECREDUCE_SMIN:
7769 case ISD::VECREDUCE_UMAX:
7770 case ISD::VECREDUCE_UMIN:
7771 case ISD::VECREDUCE_FADD:
7772 case ISD::VECREDUCE_FMAX:
7773 case ISD::VECREDUCE_FMIN:
7774 case ISD::VECREDUCE_FMAXIMUM:
7775 case ISD::VECREDUCE_FMINIMUM:
7776 return LowerVECREDUCE(Op, DAG);
7777 case ISD::ATOMIC_LOAD_AND:
7778 return LowerATOMIC_LOAD_AND(Op, DAG);
7779 case ISD::DYNAMIC_STACKALLOC:
7780 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7781 case ISD::VSCALE:
7782 return LowerVSCALE(Op, DAG);
7783 case ISD::VECTOR_COMPRESS:
7784 return LowerVECTOR_COMPRESS(Op, DAG);
7785 case ISD::ANY_EXTEND:
7786 case ISD::SIGN_EXTEND:
7787 case ISD::ZERO_EXTEND:
7788 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7789 case ISD::ADDRSPACECAST:
7790 return LowerADDRSPACECAST(Op, DAG);
7791 case ISD::SIGN_EXTEND_INREG: {
7792 // Only custom lower when ExtraVT has a legal byte based element type.
7793 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7794 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7795 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7796 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7797 return SDValue();
7798
7799 return LowerToPredicatedOp(Op, DAG,
7800 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7801 }
7802 case ISD::TRUNCATE:
7803 return LowerTRUNCATE(Op, DAG);
7804 case ISD::MLOAD:
7805 return LowerMLOAD(Op, DAG);
7806 case ISD::LOAD:
7807 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7808 !Subtarget->isNeonAvailable()))
7809 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7810 return LowerLOAD(Op, DAG);
7811 case ISD::ADD:
7812 case ISD::AND:
7813 case ISD::SUB:
7814 return LowerToScalableOp(Op, DAG);
7815 case ISD::FMAXIMUM:
7816 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7817 case ISD::FMAXNUM:
7818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7819 case ISD::FMINIMUM:
7820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7821 case ISD::FMINNUM:
7822 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7823 case ISD::VSELECT:
7824 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7825 case ISD::ABS:
7826 return LowerABS(Op, DAG);
7827 case ISD::ABDS:
7828 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7829 case ISD::ABDU:
7830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7831 case ISD::AVGFLOORS:
7832 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7833 case ISD::AVGFLOORU:
7834 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7835 case ISD::AVGCEILS:
7836 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7837 case ISD::AVGCEILU:
7838 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7839 case ISD::BITREVERSE:
7840 return LowerBitreverse(Op, DAG);
7841 case ISD::BSWAP:
7842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7843 case ISD::CTLZ:
7844 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7845 case ISD::CTTZ:
7846 return LowerCTTZ(Op, DAG);
7847 case ISD::VECTOR_SPLICE:
7848 return LowerVECTOR_SPLICE(Op, DAG);
7849 case ISD::VECTOR_DEINTERLEAVE:
7850 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7851 case ISD::VECTOR_INTERLEAVE:
7852 return LowerVECTOR_INTERLEAVE(Op, DAG);
7853 case ISD::GET_ACTIVE_LANE_MASK:
7854 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7855 case ISD::LRINT:
7856 case ISD::LLRINT:
7857 if (Op.getValueType().isVector())
7858 return LowerVectorXRINT(Op, DAG);
7859 [[fallthrough]];
7860 case ISD::LROUND:
7861 case ISD::LLROUND: {
7862 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7863 Op.getOperand(0).getValueType() == MVT::bf16) &&
7864 "Expected custom lowering of rounding operations only for f16");
7865 SDLoc DL(Op);
7866 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7867 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7868 }
7869 case ISD::STRICT_LROUND:
7870 case ISD::STRICT_LLROUND:
7871 case ISD::STRICT_LRINT:
7872 case ISD::STRICT_LLRINT: {
7873 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7874 Op.getOperand(1).getValueType() == MVT::bf16) &&
7875 "Expected custom lowering of rounding operations only for f16");
7876 SDLoc DL(Op);
7877 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7878 {Op.getOperand(0), Op.getOperand(1)});
7879 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7880 {Ext.getValue(1), Ext.getValue(0)});
7881 }
7882 case ISD::WRITE_REGISTER: {
7883 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7884 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7885 SDLoc DL(Op);
7886
7887 SDValue Chain = Op.getOperand(0);
7888 SDValue SysRegName = Op.getOperand(1);
7889 std::pair<SDValue, SDValue> Pair =
7890 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7891
7892 // chain = MSRR(chain, sysregname, lo, hi)
7893 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7894 SysRegName, Pair.first, Pair.second);
7895
7896 return Result;
7897 }
7898 case ISD::FSHL:
7899 case ISD::FSHR:
7900 return LowerFunnelShift(Op, DAG);
7901 case ISD::FLDEXP:
7902 return LowerFLDEXP(Op, DAG);
7903 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7904 return LowerVECTOR_HISTOGRAM(Op, DAG);
7905 case ISD::PARTIAL_REDUCE_SMLA:
7906 case ISD::PARTIAL_REDUCE_UMLA:
7907 case ISD::PARTIAL_REDUCE_SUMLA:
7908 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7909 }
7910}
7911
7912 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7913 return !Subtarget->useSVEForFixedLengthVectors();
7914}
7915
7916 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7917 EVT VT, bool OverrideNEON) const {
7918 if (!VT.isFixedLengthVector() || !VT.isSimple())
7919 return false;
7920
7921 // Don't use SVE for vectors we cannot scalarize if required.
7922 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7923 // Fixed length predicates should be promoted to i8.
7924 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7925 case MVT::i1:
7926 default:
7927 return false;
7928 case MVT::i8:
7929 case MVT::i16:
7930 case MVT::i32:
7931 case MVT::i64:
7932 case MVT::f16:
7933 case MVT::f32:
7934 case MVT::f64:
7935 break;
7936 }
7937
7938 // NEON-sized vectors can be emulated using SVE instructions.
7939 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7940 return Subtarget->isSVEorStreamingSVEAvailable();
7941
7942 // Ensure NEON MVTs only belong to a single register class.
7943 if (VT.getFixedSizeInBits() <= 128)
7944 return false;
7945
7946 // Ensure wider than NEON code generation is enabled.
7947 if (!Subtarget->useSVEForFixedLengthVectors())
7948 return false;
7949
7950 // Don't use SVE for types that don't fit.
7951 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7952 return false;
7953
7954 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7955 // the base fixed length SVE support in place.
7956 if (!VT.isPow2VectorType())
7957 return false;
7958
7959 return true;
7960}
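// Illustrative note (added in this edited listing, not part of the LLVM
// source): with a minimum SVE vector width of 256 bits configured, a v8i32
// (256-bit) fixed-length vector passes every check above and is lowered with
// SVE, a v4i32 (128-bit) vector stays on NEON unless OverrideNEON is set, and
// any i1-element or non-power-of-two vector is rejected.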
7961
7962//===----------------------------------------------------------------------===//
7963// Calling Convention Implementation
7964//===----------------------------------------------------------------------===//
7965
7966static unsigned getIntrinsicID(const SDNode *N) {
7967 unsigned Opcode = N->getOpcode();
7968 switch (Opcode) {
7969 default:
7970 return Intrinsic::not_intrinsic;
7971 case ISD::INTRINSIC_WO_CHAIN: {
7972 unsigned IID = N->getConstantOperandVal(0);
7973 if (IID < Intrinsic::num_intrinsics)
7974 return IID;
7975 return Intrinsic::not_intrinsic;
7976 }
7977 }
7978}
7979
7980 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7981 SDValue N1) const {
7982 if (!N0.hasOneUse())
7983 return false;
7984
7985 unsigned IID = getIntrinsicID(N1.getNode());
7986 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7987 if (IID == Intrinsic::aarch64_neon_umull ||
7988 N1.getOpcode() == AArch64ISD::UMULL ||
7989 IID == Intrinsic::aarch64_neon_smull ||
7990 N1.getOpcode() == AArch64ISD::SMULL)
7991 return N0.getOpcode() != ISD::ADD;
7992
7993 return true;
7994}
7995
7996/// Selects the correct CCAssignFn for a given CallingConvention value.
7997 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7998 bool IsVarArg) const {
7999 switch (CC) {
8000 default:
8001 reportFatalUsageError("unsupported calling convention");
8002 case CallingConv::GHC:
8003 return CC_AArch64_GHC;
8004 case CallingConv::PreserveNone:
8005 // The VarArg implementation makes assumptions about register
8006 // argument passing that do not hold for preserve_none, so we
8007 // instead fall back to C argument passing.
8008 // The non-vararg case is handled in the CC function itself.
8009 if (!IsVarArg)
8010 return CC_AArch64_Preserve_None;
8011 [[fallthrough]];
8012 case CallingConv::C:
8013 case CallingConv::Fast:
8014 case CallingConv::PreserveMost:
8015 case CallingConv::PreserveAll:
8016 case CallingConv::CXX_FAST_TLS:
8017 case CallingConv::Swift:
8018 case CallingConv::SwiftTail:
8019 case CallingConv::Tail:
8020 case CallingConv::GRAAL:
8021 if (Subtarget->isTargetWindows()) {
8022 if (IsVarArg) {
8023 if (Subtarget->isWindowsArm64EC())
8024 return CC_AArch64_Arm64EC_VarArg;
8025 return CC_AArch64_Win64_VarArg;
8026 }
8027 return CC_AArch64_Win64PCS;
8028 }
8029 if (!Subtarget->isTargetDarwin())
8030 return CC_AArch64_AAPCS;
8031 if (!IsVarArg)
8032 return CC_AArch64_DarwinPCS;
8033 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8034 : CC_AArch64_DarwinPCS_VarArg;
8035 case CallingConv::Win64:
8036 if (IsVarArg) {
8037 if (Subtarget->isWindowsArm64EC())
8038 return CC_AArch64_Arm64EC_VarArg;
8039 return CC_AArch64_Win64_VarArg;
8040 }
8041 return CC_AArch64_Win64PCS;
8043 if (Subtarget->isWindowsArm64EC())
8051 return CC_AArch64_AAPCS;
8056 }
8057}
8058
8059CCAssignFn *
8060 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8061 switch (CC) {
8062 default:
8063 return RetCC_AArch64_AAPCS;
8067 if (Subtarget->isWindowsArm64EC())
8069 return RetCC_AArch64_AAPCS;
8070 }
8071}
8072
8073static bool isPassedInFPR(EVT VT) {
8074 return VT.isFixedLengthVector() ||
8075 (VT.isFloatingPoint() && !VT.isScalableVector());
8076}
8077
8078 static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8079 AArch64FunctionInfo &FuncInfo,
8080 SelectionDAG &DAG) {
8081 if (!FuncInfo.hasZT0SpillSlotIndex())
8082 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8083
8084 return DAG.getFrameIndex(
8085 FuncInfo.getZT0SpillSlotIndex(),
8086 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8087}
8088
8089// Emit a call to __arm_sme_save or __arm_sme_restore.
8090 static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8091 SelectionDAG &DAG,
8092 AArch64FunctionInfo *Info, SDLoc DL,
8093 SDValue Chain, bool IsSave) {
8096 FuncInfo->setSMESaveBufferUsed();
8098 Args.emplace_back(
8099 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8101
8102 RTLIB::Libcall LC =
8103 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8104 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8105 TLI.getPointerTy(DAG.getDataLayout()));
8106 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8107 TargetLowering::CallLoweringInfo CLI(DAG);
8108 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8109 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8110 return TLI.LowerCallTo(CLI).second;
8111}
8112
8113 static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
8114 const AArch64TargetLowering &TLI,
8115 const AArch64RegisterInfo &TRI,
8116 AArch64FunctionInfo &FuncInfo,
8117 SelectionDAG &DAG) {
8118 // Conditionally restore the lazy save using a pseudo node.
8119 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8120 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8121 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8123 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8124 TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
8125 SDValue TPIDR2_EL0 = DAG.getNode(
8126 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8127 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8128 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8129 // RESTORE_ZA pseudo.
8130 SDValue Glue;
8131 SDValue TPIDR2Block = DAG.getFrameIndex(
8132 TPIDR2.FrameIndex,
8134 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8135 Chain =
8136 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8137 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8138 RestoreRoutine, RegMask, Chain.getValue(1)});
8139 // Finally reset the TPIDR2_EL0 register to 0.
8140 Chain = DAG.getNode(
8141 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8142 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8143 DAG.getConstant(0, DL, MVT::i64));
8144 TPIDR2.Uses++;
8145 return Chain;
8146}
8147
8148SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8149 SelectionDAG &DAG) const {
8150 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8151 SDValue Glue = Chain.getValue(1);
8152
8153 MachineFunction &MF = DAG.getMachineFunction();
8154 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8155 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8156 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8157
8158 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8159
8160 // The following conditions are true on entry to an exception handler:
8161 // - PSTATE.SM is 0.
8162 // - PSTATE.ZA is 0.
8163 // - TPIDR2_EL0 is null.
8164 // See:
8165 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8166 //
8167 // Therefore, if the function that contains this exception handler is a
8168 // streaming[-compatible] function, we must re-enable streaming mode.
8169 //
8170 // These mode changes are usually optimized away in catch blocks as they
8171 // occur before the __cxa_begin_catch (which is a non-streaming function),
8172 // but are necessary in some cases (such as for cleanups).
8173 //
8174 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8175
8176 // [COND_]SMSTART SM
8177 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8178 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8179 /*Glue*/ Glue, AArch64SME::Always);
8180 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8181 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8182 AArch64SME::IfCallerIsStreaming);
8183
8184 if (getTM().useNewSMEABILowering())
8185 return Chain;
8186
8187 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8188 // Restore full ZA
8189 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8190 /*IsSave=*/false);
8191 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8192 // SMSTART ZA
8193 Chain = DAG.getNode(
8194 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8195 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8196
8197 // Restore ZT0
8198 if (SMEFnAttrs.hasZT0State()) {
8199 SDValue ZT0FrameIndex =
8200 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8201 Chain =
8202 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8203 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8204 }
8205
8206 // Restore ZA
8207 if (SMEFnAttrs.hasZAState())
8208 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8209 }
8210
8211 return Chain;
8212}
8213
8214SDValue AArch64TargetLowering::LowerFormalArguments(
8215 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8216 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8217 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8218 MachineFunction &MF = DAG.getMachineFunction();
8219 const Function &F = MF.getFunction();
8220 MachineFrameInfo &MFI = MF.getFrameInfo();
8221 bool IsWin64 =
8222 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8223 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8224 (isVarArg && Subtarget->isWindowsArm64EC());
8225 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8226
8226 SmallVector<ISD::OutputArg, 4> Outs;
8228 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8229 DAG.getTargetLoweringInfo(), MF.getDataLayout());
8230 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8231 FuncInfo->setIsSVECC(true);
8232
8233 // Assign locations to all of the incoming arguments.
8234 SmallVector<CCValAssign, 16> ArgLocs;
8235 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8236
8237 // At this point, Ins[].VT may already be promoted to i32. To correctly
8238 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8239 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8240 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8241 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8242 // LocVT.
8243 unsigned NumArgs = Ins.size();
8244 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8245 unsigned CurArgIdx = 0;
8246 bool UseVarArgCC = false;
8247 if (IsWin64)
8248 UseVarArgCC = isVarArg;
8249
8250 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8251
8252 for (unsigned i = 0; i != NumArgs; ++i) {
8253 MVT ValVT = Ins[i].VT;
8254 if (Ins[i].isOrigArg()) {
8255 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8256 CurArgIdx = Ins[i].getOrigArgIndex();
8257
8258 // Get type of the original argument.
8259 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8260 /*AllowUnknown*/ true);
8261 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8262 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8263 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8264 ValVT = MVT::i8;
8265 else if (ActualMVT == MVT::i16)
8266 ValVT = MVT::i16;
8267 }
8268 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8269 Ins[i].OrigTy, CCInfo);
8270 assert(!Res && "Call operand has unhandled type");
8271 (void)Res;
8272 }
8273
8274 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8275 bool IsLocallyStreaming =
8276 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8277 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8278 SDValue Glue = Chain.getValue(1);
8279
8280 unsigned ExtraArgLocs = 0;
8281 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8282 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8283
8284 if (Ins[i].Flags.isByVal()) {
8285 // Byval is used for HFAs in the PCS, but the system should work in a
8286 // non-compliant manner for larger structs.
8287 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8288 int Size = Ins[i].Flags.getByValSize();
8289 unsigned NumRegs = (Size + 7) / 8;
8290
8291 // FIXME: This works on big-endian for composite byvals, which are the common
8292 // case. It should also work for fundamental types too.
8293 unsigned FrameIdx =
8294 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8295 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8296 InVals.push_back(FrameIdxN);
8297
8298 continue;
8299 }
8300
8301 if (Ins[i].Flags.isSwiftAsync())
8302 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8303
8304 SDValue ArgValue;
8305 if (VA.isRegLoc()) {
8306 // Arguments stored in registers.
8307 EVT RegVT = VA.getLocVT();
8308 const TargetRegisterClass *RC;
8309
8310 if (RegVT == MVT::i32)
8311 RC = &AArch64::GPR32RegClass;
8312 else if (RegVT == MVT::i64)
8313 RC = &AArch64::GPR64RegClass;
8314 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8315 RC = &AArch64::FPR16RegClass;
8316 else if (RegVT == MVT::f32)
8317 RC = &AArch64::FPR32RegClass;
8318 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8319 RC = &AArch64::FPR64RegClass;
8320 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8321 RC = &AArch64::FPR128RegClass;
8322 else if (RegVT.isScalableVector() &&
8323 RegVT.getVectorElementType() == MVT::i1) {
8324 FuncInfo->setIsSVECC(true);
8325 RC = &AArch64::PPRRegClass;
8326 } else if (RegVT == MVT::aarch64svcount) {
8327 FuncInfo->setIsSVECC(true);
8328 RC = &AArch64::PPRRegClass;
8329 } else if (RegVT.isScalableVector()) {
8330 FuncInfo->setIsSVECC(true);
8331 RC = &AArch64::ZPRRegClass;
8332 } else
8333 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8334
8335 // Transform the arguments in physical registers into virtual ones.
8336 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8337
8338 if (IsLocallyStreaming) {
8339 // LocallyStreamingFunctions must insert the SMSTART in the correct
8340 // position, so we use Glue to ensure no instructions can be scheduled
8341 // between the chain of:
8342 // t0: ch,glue = EntryNode
8343 // t1: res,ch,glue = CopyFromReg
8344 // ...
8345 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8346 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8347 // ^^^^^^
8348 // This will be the new Chain/Root node.
8349 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8350 Glue = ArgValue.getValue(2);
8351 if (isPassedInFPR(ArgValue.getValueType())) {
8352 ArgValue =
8353 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8354 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8355 {ArgValue, Glue});
8356 Glue = ArgValue.getValue(1);
8357 }
8358 } else
8359 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8360
8361 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8362 // to 64 bits. Insert an assert[sz]ext to capture this, then
8363 // truncate to the right size.
8364 switch (VA.getLocInfo()) {
8365 default:
8366 llvm_unreachable("Unknown loc info!");
8367 case CCValAssign::Full:
8368 break;
8369 case CCValAssign::Indirect:
8370 assert(
8371 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8372 "Indirect arguments should be scalable on most subtargets");
8373 break;
8374 case CCValAssign::BCvt:
8375 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8376 break;
8377 case CCValAssign::AExt:
8378 case CCValAssign::SExt:
8379 case CCValAssign::ZExt:
8380 break;
8381 case CCValAssign::AExtUpper:
8382 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8383 DAG.getConstant(32, DL, RegVT));
8384 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8385 break;
8386 }
8387 } else { // VA.isRegLoc()
8388 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8389 unsigned ArgOffset = VA.getLocMemOffset();
8390 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8391 ? VA.getLocVT().getSizeInBits()
8392 : VA.getValVT().getSizeInBits()) / 8;
8393
8394 uint32_t BEAlign = 0;
8395 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8396 !Ins[i].Flags.isInConsecutiveRegs())
8397 BEAlign = 8 - ArgSize;
8398
8399 SDValue FIN;
8400 MachinePointerInfo PtrInfo;
8401 if (StackViaX4) {
8402 // In both the ARM64EC varargs convention and the thunk convention,
8403 // arguments on the stack are accessed relative to x4, not sp. In
8404 // the thunk convention, there's an additional offset of 32 bytes
8405 // to account for the shadow store.
8406 unsigned ObjOffset = ArgOffset + BEAlign;
8407 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8408 ObjOffset += 32;
8409 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8410 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8411 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8412 DAG.getConstant(ObjOffset, DL, MVT::i64));
8413 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8414 } else {
8415 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8416
8417 // Create load nodes to retrieve arguments from the stack.
8418 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8419 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8420 }
8421
8422 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
8423 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8424 MVT MemVT = VA.getValVT();
8425
8426 switch (VA.getLocInfo()) {
8427 default:
8428 break;
8429 case CCValAssign::Trunc:
8430 case CCValAssign::BCvt:
8431 MemVT = VA.getLocVT();
8432 break;
8433 case CCValAssign::Indirect:
8434 assert((VA.getValVT().isScalableVT() ||
8435 Subtarget->isWindowsArm64EC()) &&
8436 "Indirect arguments should be scalable on most subtargets");
8437 MemVT = VA.getLocVT();
8438 break;
8439 case CCValAssign::SExt:
8440 ExtType = ISD::SEXTLOAD;
8441 break;
8442 case CCValAssign::ZExt:
8443 ExtType = ISD::ZEXTLOAD;
8444 break;
8445 case CCValAssign::AExt:
8446 ExtType = ISD::EXTLOAD;
8447 break;
8448 }
8449
8450 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8451 MemVT);
8452 }
8453
8454 if (VA.getLocInfo() == CCValAssign::Indirect) {
8455 assert((VA.getValVT().isScalableVT() ||
8456 Subtarget->isWindowsArm64EC()) &&
8457 "Indirect arguments should be scalable on most subtargets");
8458
8459 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8460 unsigned NumParts = 1;
8461 if (Ins[i].Flags.isInConsecutiveRegs()) {
8462 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8463 ++NumParts;
8464 }
8465
8466 MVT PartLoad = VA.getValVT();
8467 SDValue Ptr = ArgValue;
8468
8469 // Ensure we generate all loads for each tuple part, whilst updating the
8470 // pointer after each load correctly using vscale.
8471 while (NumParts > 0) {
8472 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8473 InVals.push_back(ArgValue);
8474 NumParts--;
8475 if (NumParts > 0) {
8476 SDValue BytesIncrement;
8477 if (PartLoad.isScalableVector()) {
8478 BytesIncrement = DAG.getVScale(
8479 DL, Ptr.getValueType(),
8480 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8481 } else {
8482 BytesIncrement = DAG.getConstant(
8483 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8484 Ptr.getValueType());
8485 }
8486 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8487 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8488 ExtraArgLocs++;
8489 i++;
8490 }
8491 }
8492 } else {
8493 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8494 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8495 ArgValue, DAG.getValueType(MVT::i32));
8496
8497 // i1 arguments are zero-extended to i8 by the caller. Emit a
8498 // hint to reflect this.
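// (AArch64ISD::ASSERT_ZEXT_BOOL is an AssertZext-style marker: it records
// that the incoming value is already 0 or 1, letting later combines drop
// redundant zero-extensions of the boolean.)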
8499 if (Ins[i].isOrigArg()) {
8500 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8501 if (OrigArg->getType()->isIntegerTy(1)) {
8502 if (!Ins[i].Flags.isZExt()) {
8503 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8504 ArgValue.getValueType(), ArgValue);
8505 }
8506 }
8507 }
8508
8509 InVals.push_back(ArgValue);
8510 }
8511 }
8512 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8513
8514 if (Attrs.hasStreamingCompatibleInterface()) {
8515 SDValue EntryPStateSM =
8516 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8517 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8518
8519 // Copy the value to a virtual register, and save that in FuncInfo.
8520 Register EntryPStateSMReg =
8521 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8522 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8523 EntryPStateSM);
8524 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8525 }
8526
8527 // Insert the SMSTART if this is a locally streaming function and
8528 // make sure it is Glued to the last CopyFromReg value.
8529 if (IsLocallyStreaming) {
8530 if (Attrs.hasStreamingCompatibleInterface())
8531 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8532 AArch64SME::IfCallerIsNonStreaming);
8533
8534 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8536
8537 // Ensure that the SMSTART happens after the CopyWithChain such that its
8538 // chain result is used.
8539 for (unsigned I=0; I<InVals.size(); ++I) {
8540 Register Reg = MF.getRegInfo().createVirtualRegister(
8541 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8542 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8543 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8544 InVals[I].getValueType());
8545 }
8546 }
8547
8548 // varargs
8549 if (isVarArg) {
8551 if (!Subtarget->isTargetDarwin() || IsWin64) {
8552 // The AAPCS variadic function ABI is identical to the non-variadic
8553 // one. As a result there may be more arguments in registers and we
8554 // should save them for future reference.
8555 // Win64 variadic functions also pass arguments in registers, but all
8556 // float arguments are passed in integer registers.
8557 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8558 }
8559
8560 // This will point to the next argument passed via stack.
8561 unsigned VarArgsOffset = CCInfo.getStackSize();
8562 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8563 VarArgsOffset =
8564 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8565 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8566 FuncInfo->setVarArgsStackIndex(
8567 MFI.CreateFixedObject(4, VarArgsOffset, true));
8568 }
8569
8570 if (MFI.hasMustTailInVarArgFunc()) {
8571 SmallVector<MVT, 2> RegParmTypes;
8572 RegParmTypes.push_back(MVT::i64);
8573 RegParmTypes.push_back(MVT::f128);
8574 // Compute the set of forwarded registers. The rest are scratch.
8575 SmallVectorImpl<ForwardedRegister> &Forwards =
8576 FuncInfo->getForwardedMustTailRegParms();
8577 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8578 CC_AArch64_AAPCS);
8579
8580 // Conservatively forward X8, since it might be used for aggregate return.
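// (X8 is the AAPCS64 indirect-result register, so a musttail callee that
// returns a large aggregate may still need the value forwarded here.)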
8581 if (!CCInfo.isAllocated(AArch64::X8)) {
8582 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8583 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8584 }
8585 }
8586 }
8587
8588 // On Windows, InReg pointers must be returned, so record the pointer in a
8589 // virtual register at the start of the function so it can be returned in the
8590 // epilogue.
8591 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8592 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8593 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8594 Ins[I].Flags.isInReg()) &&
8595 Ins[I].Flags.isSRet()) {
8596 assert(!FuncInfo->getSRetReturnReg());
8597
8598 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8599 Register Reg =
8600 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8601 FuncInfo->setSRetReturnReg(Reg);
8602
8603 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8604 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8605 break;
8606 }
8607 }
8608 }
8609
8610 unsigned StackArgSize = CCInfo.getStackSize();
8611 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8612 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8613 // This is a non-standard ABI so by fiat I say we're allowed to make full
8614 // use of the stack area to be popped, which must be aligned to 16 bytes in
8615 // any case:
8616 StackArgSize = alignTo(StackArgSize, 16);
8617
8618 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8619 // a multiple of 16.
8620 FuncInfo->setArgumentStackToRestore(StackArgSize);
8621
8622 // This realignment carries over to the available bytes below. Our own
8623 // callers will guarantee the space is free by giving an aligned value to
8624 // CALLSEQ_START.
8625 }
8626 // Even if we're not expected to free up the space, it's useful to know how
8627 // much is there while considering tail calls (because we can reuse it).
8628 FuncInfo->setBytesInStackArgArea(StackArgSize);
8629
8630 if (Subtarget->hasCustomCallingConv())
8631 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8632
8633 if (getTM().useNewSMEABILowering()) {
8634 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8635 SDValue Size;
8636 if (Attrs.hasZAState()) {
8637 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8638 DAG.getConstant(1, DL, MVT::i32));
8639 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8640 } else if (Attrs.hasAgnosticZAInterface()) {
8641 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8642 SDValue Callee = DAG.getExternalSymbol(
8643 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8644 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8645 TargetLowering::CallLoweringInfo CLI(DAG);
8646 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8647 getLibcallCallingConv(LC), RetTy, Callee, {});
8648 std::tie(Size, Chain) = LowerCallTo(CLI);
8649 }
8650 if (Size) {
8651 SDValue Buffer = DAG.getNode(
8652 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8653 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8654 Chain = Buffer.getValue(1);
8655
8656 Register BufferPtr =
8657 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8658 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8659 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8660 DAG.getVTList(MVT::Other), Chain);
8661 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8662 MFI.CreateVariableSizedObject(Align(16), nullptr);
8663 }
8664 }
8665 } else {
8666 // Old SME ABI lowering (deprecated):
8667 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8668 // will be expanded and stored in the static object later using a
8669 // pseudonode.
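// The object mirrors the SME ABI's TPIDR2 block: an 8-byte pointer to the ZA
// save buffer followed by a 2-byte count of save slices (remaining bytes
// reserved), matching the operands passed to INIT_TPIDR2OBJ below.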
8670 if (Attrs.hasZAState()) {
8671 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8672 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8673 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8674 DAG.getConstant(1, DL, MVT::i32));
8675 SDValue Buffer;
8676 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8677 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8678 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8679 } else {
8680 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8681 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8682 DAG.getVTList(MVT::i64, MVT::Other),
8683 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8684 MFI.CreateVariableSizedObject(Align(16), nullptr);
8685 }
8686 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8687 DAG.getConstant(1, DL, MVT::i32));
8688 Chain = DAG.getNode(
8689 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8690 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8691 /*Num save slices*/ NumZaSaveSlices});
8692 } else if (Attrs.hasAgnosticZAInterface()) {
8693 // Call __arm_sme_state_size().
8694 SDValue BufferSize =
8695 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8696 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8697 Chain = BufferSize.getValue(1);
8698 SDValue Buffer;
8699 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8700 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8701 DAG.getVTList(MVT::i64, MVT::Other),
8702 {Chain, BufferSize});
8703 } else {
8704 // Allocate space dynamically.
8705 Buffer = DAG.getNode(
8706 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8707 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8708 MFI.CreateVariableSizedObject(Align(16), nullptr);
8709 }
8710 // Copy the value to a virtual register, and save that in FuncInfo.
8711 Register BufferPtr =
8712 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8713 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8714 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8715 }
8716 }
8717
8718 if (CallConv == CallingConv::PreserveNone) {
8719 for (const ISD::InputArg &I : Ins) {
8720 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8721 I.Flags.isSwiftAsync()) {
8722 MachineFunction &MF = DAG.getMachineFunction();
8723 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8724 MF.getFunction(),
8725 "Swift attributes can't be used with preserve_none",
8726 DL.getDebugLoc()));
8727 break;
8728 }
8729 }
8730 }
8731
8732 if (getTM().useNewSMEABILowering()) {
8733 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8734 if (Attrs.isNewZT0())
8735 Chain = DAG.getNode(
8736 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8737 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8738 DAG.getTargetConstant(0, DL, MVT::i32));
8739 }
8740
8741 return Chain;
8742}
8743
8744void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8745 SelectionDAG &DAG,
8746 const SDLoc &DL,
8747 SDValue &Chain) const {
8748 MachineFunction &MF = DAG.getMachineFunction();
8749 MachineFrameInfo &MFI = MF.getFrameInfo();
8750 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8751 auto PtrVT = getPointerTy(DAG.getDataLayout());
8752 Function &F = MF.getFunction();
8753 bool IsWin64 =
8754 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8755
8756 SmallVector<SDValue, 8> MemOps;
8757
8758 ArrayRef<MCPhysReg> GPRArgRegs = AArch64::getGPRArgRegs();
8759 unsigned NumGPRArgRegs = GPRArgRegs.size();
8760 if (Subtarget->isWindowsArm64EC()) {
8761 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8762 // functions.
8763 NumGPRArgRegs = 4;
8764 }
8765 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8766
8767 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8768 int GPRIdx = 0;
8769 if (GPRSaveSize != 0) {
8770 if (IsWin64) {
8771 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8772 if (GPRSaveSize & 15)
8773 // The extra size here, if triggered, will always be 8.
8774 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8775 } else
8776 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8777
8778 SDValue FIN;
8779 if (Subtarget->isWindowsArm64EC()) {
8780 // With the Arm64EC ABI, we reserve the save area as usual, but we
8781 // compute its address relative to x4. For a normal AArch64->AArch64
8782 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8783 // different address.
8784 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8785 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8786 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8787 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8788 } else {
8789 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8790 }
8791
8792 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8793 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8794 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8795 SDValue Store =
8796 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8797 IsWin64 ? MachinePointerInfo::getFixedStack(
8798 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8799 : MachinePointerInfo::getStack(MF, i * 8));
8800 MemOps.push_back(Store);
8801 FIN =
8802 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8803 }
8804 }
8805 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8806 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8807
8808 if (Subtarget->hasFPARMv8() && !IsWin64) {
8809 ArrayRef<MCPhysReg> FPRArgRegs = AArch64::getFPRArgRegs();
8810 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8811 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8812
8813 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8814 int FPRIdx = 0;
8815 if (FPRSaveSize != 0) {
8816 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8817
8818 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8819
8820 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8821 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8822 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8823
8824 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8825 MachinePointerInfo::getStack(MF, i * 16));
8826 MemOps.push_back(Store);
8827 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8828 DAG.getConstant(16, DL, PtrVT));
8829 }
8830 }
8831 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8832 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8833 }
8834
8835 if (!MemOps.empty()) {
8836 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8837 }
8838}
8839
8840/// LowerCallResult - Lower the result values of a call into the
8841/// appropriate copies out of appropriate physical registers.
8842SDValue AArch64TargetLowering::LowerCallResult(
8843 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8844 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8845 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8846 SDValue ThisVal, bool RequiresSMChange) const {
8847 DenseMap<unsigned, SDValue> CopiedRegs;
8848 // Copy all of the result registers out of their specified physreg.
8849 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8850 CCValAssign VA = RVLocs[i];
8851
8852 // Pass 'this' value directly from the argument to return value, to avoid
8853 // reg unit interference
8854 if (i == 0 && isThisReturn) {
8855 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8856 "unexpected return calling convention register assignment");
8857 InVals.push_back(ThisVal);
8858 continue;
8859 }
8860
8861 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8862 // allows one use of a physreg per block.
8863 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8864 if (!Val) {
8865 Val =
8866 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8867 Chain = Val.getValue(1);
8868 InGlue = Val.getValue(2);
8869 CopiedRegs[VA.getLocReg()] = Val;
8870 }
8871
8872 switch (VA.getLocInfo()) {
8873 default:
8874 llvm_unreachable("Unknown loc info!");
8875 case CCValAssign::Full:
8876 break;
8877 case CCValAssign::BCvt:
8878 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8879 break;
8880 case CCValAssign::AExtUpper:
8881 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8882 DAG.getConstant(32, DL, VA.getLocVT()));
8883 [[fallthrough]];
8884 case CCValAssign::AExt:
8885 [[fallthrough]];
8886 case CCValAssign::ZExt:
8887 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8888 break;
8889 }
8890
8891 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8892 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8893 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8894
8895 InVals.push_back(Val);
8896 }
8897
8898 return Chain;
8899}
8900
8901/// Return true if the calling convention is one that we can guarantee TCO for.
8902static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8903 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8904 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8905}
8906
8907/// Return true if we might ever do TCO for calls with this calling convention.
8908static bool mayTailCallThisCC(CallingConv::ID CC) {
8909 switch (CC) {
8910 case CallingConv::C:
8911 case CallingConv::AArch64_SVE_VectorCall:
8912 case CallingConv::PreserveMost:
8913 case CallingConv::PreserveAll:
8914 case CallingConv::PreserveNone:
8915 case CallingConv::Swift:
8916 case CallingConv::SwiftTail:
8917 case CallingConv::Tail:
8918 case CallingConv::Fast:
8919 return true;
8920 default:
8921 return false;
8922 }
8923}
8924
8925/// Return true if the call convention supports varargs
8926/// Currently only those that pass varargs like the C
8927/// calling convention does are eligible
8928/// Calling conventions listed in this function must also
8929/// be properly handled in AArch64Subtarget::isCallingConvWin64
8930static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8931 switch (CC) {
8932 case CallingConv::C:
8934 // SVE vector call is only partially supported, but it should
8935 // support named arguments being passed. Any arguments being passed
8936 // as varargs, are still unsupported.
8937 case CallingConv::AArch64_SVE_VectorCall:
8938 return true;
8939 default:
8940 return false;
8941 }
8942}
8943
8944static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8945 const AArch64Subtarget *Subtarget,
8946 const TargetLowering::CallLoweringInfo &CLI,
8947 CCState &CCInfo) {
8948 const SelectionDAG &DAG = CLI.DAG;
8949 CallingConv::ID CalleeCC = CLI.CallConv;
8950 bool IsVarArg = CLI.IsVarArg;
8951 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8952 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8953
8954 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8955 // for the shadow store.
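// (This mirrors the 32-byte home/shadow space the Microsoft x64 calling
// convention reserves for the four register parameters.)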
8956 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8957 CCInfo.AllocateStack(32, Align(16));
8958
8959 unsigned NumArgs = Outs.size();
8960 for (unsigned i = 0; i != NumArgs; ++i) {
8961 MVT ArgVT = Outs[i].VT;
8962 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8963
8964 bool UseVarArgCC = false;
8965 if (IsVarArg) {
8966 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8967 // too, so use the vararg CC to force them to integer registers.
8968 if (IsCalleeWin64) {
8969 UseVarArgCC = true;
8970 } else {
8971 UseVarArgCC = ArgFlags.isVarArg();
8972 }
8973 }
8974
8975 if (!UseVarArgCC) {
8976 // Get type of the original argument.
8977 EVT ActualVT =
8978 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8979 /*AllowUnknown*/ true);
8980 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8981 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8982 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8983 ArgVT = MVT::i8;
8984 else if (ActualMVT == MVT::i16)
8985 ArgVT = MVT::i16;
8986 }
8987
8988 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8989 // argument. This logic should exactly mirror LowerFormalArguments.
8990 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8991 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8992 Outs[i].OrigTy, CCInfo);
8993 assert(!Res && "Call operand has unhandled type");
8994 (void)Res;
8995 }
8996}
8997
8998static SMECallAttrs
8999getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
9000 const TargetLowering::CallLoweringInfo &CLI) {
9001 if (CLI.CB)
9002 return SMECallAttrs(*CLI.CB, &TLI);
9003 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9004 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
9005 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
9006}
9007
9008bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9009 const CallLoweringInfo &CLI) const {
9010 CallingConv::ID CalleeCC = CLI.CallConv;
9011 if (!mayTailCallThisCC(CalleeCC))
9012 return false;
9013
9014 SDValue Callee = CLI.Callee;
9015 bool IsVarArg = CLI.IsVarArg;
9016 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9017 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9018 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9019 const SelectionDAG &DAG = CLI.DAG;
9020 MachineFunction &MF = DAG.getMachineFunction();
9021 const Function &CallerF = MF.getFunction();
9022 CallingConv::ID CallerCC = CallerF.getCallingConv();
9023
9024 // SME Streaming functions are not eligible for TCO as they may require
9025 // the streaming mode or ZA to be restored after returning from the call.
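// A tail call would return directly to our caller, leaving no point at which
// to emit the SMSTART or ZA restore, so such calls must remain normal calls.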
9026 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
9027 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9028 CallAttrs.requiresPreservingAllZAState() ||
9029 CallAttrs.caller().hasStreamingBody())
9030 return false;
9031
9032 // Functions using the C or Fast calling convention that have an SVE signature
9033 // preserve more registers and should assume the SVE_VectorCall CC.
9034 // The check for matching callee-saved regs will determine whether it is
9035 // eligible for TCO.
9036 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9037 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9038 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9039
9040 bool CCMatch = CallerCC == CalleeCC;
9041
9042 // When using the Windows calling convention on a non-windows OS, we want
9043 // to back up and restore X18 in such functions; we can't do a tail call
9044 // from those functions.
9045 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9046 CalleeCC != CallingConv::Win64)
9047 return false;
9048
9049 // Byval parameters hand the function a pointer directly into the stack area
9050 // we want to reuse during a tail call. Working around this *is* possible (see
9051 // X86) but less efficient and uglier in LowerCall.
9052 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9053 e = CallerF.arg_end();
9054 i != e; ++i) {
9055 if (i->hasByValAttr())
9056 return false;
9057
9058 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9059 // In this case, it is necessary to save X0/X1 in the callee and return it
9060 // in X0. Tail call opt may interfere with this, so we disable tail call
9061 // opt when the caller has an "inreg" attribute -- except if the callee
9062 // also has that attribute on the same argument, and the same value is
9063 // passed.
9064 if (i->hasInRegAttr()) {
9065 unsigned ArgIdx = i - CallerF.arg_begin();
9066 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9067 return false;
9068 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9069 if (!Attrs.hasAttribute(Attribute::InReg) ||
9070 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9071 CLI.CB->getArgOperand(ArgIdx) != i) {
9072 return false;
9073 }
9074 }
9075 }
9076
9077 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9078 return CCMatch;
9079
9080 // Externally-defined functions with weak linkage should not be
9081 // tail-called on AArch64 when the OS does not support dynamic
9082 // pre-emption of symbols, as the AAELF spec requires normal calls
9083 // to undefined weak functions to be replaced with a NOP or jump to the
9084 // next instruction. The behaviour of branch instructions in this
9085 // situation (as used for tail calls) is implementation-defined, so we
9086 // cannot rely on the linker replacing the tail call with a return.
9087 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9088 const GlobalValue *GV = G->getGlobal();
9089 const Triple &TT = getTargetMachine().getTargetTriple();
9090 if (GV->hasExternalWeakLinkage() &&
9091 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9092 return false;
9093 }
9094
9095 // Now we search for cases where we can use a tail call without changing the
9096 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9097 // concept.
9098
9099 // I want anyone implementing a new calling convention to think long and hard
9100 // about this assert.
9101 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9102 report_fatal_error("Unsupported variadic calling convention");
9103
9104 LLVMContext &C = *DAG.getContext();
9105 // Check that the call results are passed in the same way.
9106 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9107 CCAssignFnForCall(CalleeCC, IsVarArg),
9108 CCAssignFnForCall(CallerCC, IsVarArg)))
9109 return false;
9110 // The callee has to preserve all registers the caller needs to preserve.
9111 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9112 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9113 if (!CCMatch) {
9114 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9115 if (Subtarget->hasCustomCallingConv()) {
9116 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9117 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9118 }
9119 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9120 return false;
9121 }
9122
9123 // Nothing more to check if the callee is taking no arguments
9124 if (Outs.empty())
9125 return true;
9126
9127 SmallVector<CCValAssign, 16> ArgLocs;
9128 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9129
9130 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9131
9132 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9133 // When the call is musttail, additional checks have been done and we can safely ignore this check.
9134 // At least two cases here: if caller is fastcc then we can't have any
9135 // memory arguments (we'd be expected to clean up the stack afterwards). If
9136 // caller is C then we could potentially use its argument area.
9137
9138 // FIXME: for now we take the most conservative of these in both cases:
9139 // disallow all variadic memory operands.
9140 for (const CCValAssign &ArgLoc : ArgLocs)
9141 if (!ArgLoc.isRegLoc())
9142 return false;
9143 }
9144
9145 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9146
9147 // If any of the arguments is passed indirectly, it must be SVE, so the
9148 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9149 // allocate space on the stack. That is why we explicitly determine here
9150 // that such a call cannot be a tail call.
9151 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9152 assert((A.getLocInfo() != CCValAssign::Indirect ||
9153 A.getValVT().isScalableVector() ||
9154 Subtarget->isWindowsArm64EC()) &&
9155 "Expected value to be scalable");
9156 return A.getLocInfo() == CCValAssign::Indirect;
9157 }))
9158 return false;
9159
9160 // If the stack arguments for this call do not fit into our own save area then
9161 // the call cannot be made tail.
9162 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9163 return false;
9164
9165 const MachineRegisterInfo &MRI = MF.getRegInfo();
9166 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9167 return false;
9168
9169 return true;
9170}
9171
9172SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9173 SelectionDAG &DAG,
9174 MachineFrameInfo &MFI,
9175 int ClobberedFI) const {
9176 SmallVector<SDValue, 8> ArgChains;
9177 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9178 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9179
9180 // Include the original chain at the beginning of the list. When this is
9181 // used by target LowerCall hooks, this helps legalize find the
9182 // CALLSEQ_BEGIN node.
9183 ArgChains.push_back(Chain);
9184
9185 // Add a chain value for each stack argument corresponding
9186 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9187 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9188 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9189 if (FI->getIndex() < 0) {
9190 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9191 int64_t InLastByte = InFirstByte;
9192 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9193
9194 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9195 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9196 ArgChains.push_back(SDValue(L, 1));
9197 }
9198
9199 // Build a tokenfactor for all the chains.
9200 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9201}
9202
9203bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9204 bool TailCallOpt) const {
9205 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9206 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9207}
9208
9209// Check if the value is zero-extended from i1 to i8
9210static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9211 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9212 if (SizeInBits < 8)
9213 return false;
9214
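// Bits [7:1] of the value (mask 0xFE) must be known zero, i.e. the value can
// only be 0 or 1; the constant 4 bounds the computeKnownBits search depth.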
9215 APInt RequiredZero(SizeInBits, 0xFE);
9216 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9217 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9218 return ZExtBool;
9219}
9220
9221void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9222 SDNode *Node) const {
9223 // Live-in physreg copies that are glued to SMSTART are applied as
9224 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9225 // register allocator to pass call args in callee saved regs, without extra
9226 // copies to avoid these fake clobbers of actually-preserved GPRs.
9227 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9228 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9229 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9230 if (MachineOperand &MO = MI.getOperand(I);
9231 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9232 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9233 AArch64::GPR64RegClass.contains(MO.getReg())))
9234 MI.removeOperand(I);
9235
9236 // The SVE vector length can change when entering/leaving streaming mode.
9237 // FPMR is set to 0 when entering/leaving streaming mode.
9238 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9239 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9240 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9241 /*IsImplicit=*/true));
9242 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9243 /*IsImplicit=*/true));
9244 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9245 /*IsImplicit=*/true));
9246 }
9247 }
9248
9249 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9250 // have nothing to do with VG, were it not that they are used to materialise a
9251 // frame-address. If they contain a frame-index to a scalable vector, this
9252 // will likely require an ADDVL instruction to materialise the address, thus
9253 // reading VG.
9254 const MachineFunction &MF = *MI.getMF();
9255 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9256 (MI.getOpcode() == AArch64::ADDXri ||
9257 MI.getOpcode() == AArch64::SUBXri)) {
9258 const MachineOperand &MO = MI.getOperand(1);
9259 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9260 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9261 /*IsImplicit=*/true));
9262 }
9263}
9264
9265SDValue AArch64TargetLowering::changeStreamingMode(
9266 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9267 unsigned Condition, bool InsertVectorLengthCheck) const {
9268 MachineFunction &MF = DAG.getMachineFunction();
9269 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9270 FuncInfo->setHasStreamingModeChanges(true);
9271
9272 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9273 SmallVector<SDValue, 2> Ops = {Chain};
9274 if (InGlue)
9275 Ops.push_back(InGlue);
9276 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9277 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9278 };
9279
9280 if (InsertVectorLengthCheck && Enable) {
9281 // Non-streaming -> Streaming
9282 // Insert vector length check before smstart
9283 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9284 Chain = CheckVL.getValue(0);
9285 InGlue = CheckVL.getValue(1);
9286 }
9287
9288 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9289 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9290 SDValue MSROp =
9291 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9292 SmallVector<SDValue> Ops = {Chain, MSROp};
9293 unsigned Opcode;
9294 if (Condition != AArch64SME::Always) {
9295 Register PStateReg = FuncInfo->getPStateSMReg();
9296 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9297 SDValue PStateSM =
9298 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9299 // Use chain and glue from the CopyFromReg.
9300 Ops[0] = PStateSM.getValue(1);
9301 InGlue = PStateSM.getValue(2);
9302 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9303 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9304 Ops.push_back(ConditionOp);
9305 Ops.push_back(PStateSM);
9306 } else {
9307 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9308 }
9309 Ops.push_back(RegMask);
9310
9311 if (InGlue)
9312 Ops.push_back(InGlue);
9313
9314 SDValue SMChange =
9315 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9316
9317 if (!InsertVectorLengthCheck || Enable)
9318 return SMChange;
9319
9320 // Streaming -> Non-streaming
9321 // Insert vector length check after smstop since we cannot read VL
9322 // in streaming mode
9323 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9324}
9325
9326static AArch64SME::ToggleCondition
9327getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9328 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9329 CallAttrs.caller().hasStreamingBody())
9330 return AArch64SME::Always;
9331 if (CallAttrs.callee().hasNonStreamingInterface())
9332 return AArch64SME::IfCallerIsStreaming;
9333 if (CallAttrs.callee().hasStreamingInterface())
9334 return AArch64SME::IfCallerIsNonStreaming;
9335
9336 llvm_unreachable("Unsupported attributes");
9337}
9338
9339/// Check whether a stack argument requires lowering in a tail call.
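/// Lowering can only be skipped when the outgoing value is a load from the
/// same immutable caller stack slot, at the same offset and of the same size,
/// so storing it back would simply recreate what is already there.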
9340static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9341 const CCValAssign &VA, SDValue Arg,
9342 ISD::ArgFlagsTy Flags, int CallOffset) {
9343 // FIXME: We should be able to handle this case, but it's not clear how to.
9344 if (Flags.isZExt() || Flags.isSExt())
9345 return true;
9346
9347 for (;;) {
9348 // Look through nodes that don't alter the bits of the incoming value.
9349 unsigned Op = Arg.getOpcode();
9350 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9351 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9352 Arg = Arg.getOperand(0);
9353 continue;
9354 }
9355 break;
9356 }
9357
9358 // If the argument is a load from the same immutable stack slot, we can reuse
9359 // it.
9360 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9361 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9362 const MachineFrameInfo &MFI = MF.getFrameInfo();
9363 int FI = FINode->getIndex();
9364 if (!MFI.isImmutableObjectIndex(FI))
9365 return true;
9366 if (CallOffset != MFI.getObjectOffset(FI))
9367 return true;
9368 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9369 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9370 return true;
9371 return false;
9372 }
9373 }
9374
9375 return true;
9376}
9377
9378/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9379/// and add input and output parameter nodes.
9380SDValue
9381AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9382 SmallVectorImpl<SDValue> &InVals) const {
9383 SelectionDAG &DAG = CLI.DAG;
9384 SDLoc &DL = CLI.DL;
9385 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9386 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9387 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9388 SDValue Chain = CLI.Chain;
9389 SDValue Callee = CLI.Callee;
9390 bool &IsTailCall = CLI.IsTailCall;
9391 CallingConv::ID &CallConv = CLI.CallConv;
9392 bool IsVarArg = CLI.IsVarArg;
9393 const CallBase *CB = CLI.CB;
9394
9395 MachineFunction &MF = DAG.getMachineFunction();
9396 MachineFunction::CallSiteInfo CSInfo;
9397 bool IsThisReturn = false;
9398
9399 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9400 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9401 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9402 bool IsSibCall = false;
9403 bool GuardWithBTI = false;
9404
9405 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9406 !Subtarget->noBTIAtReturnTwice()) {
9407 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9408 }
9409
9410 // Analyze operands of the call, assigning locations to each operand.
9411 SmallVector<CCValAssign, 16> ArgLocs;
9412 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9413
9414 if (IsVarArg) {
9415 unsigned NumArgs = Outs.size();
9416
9417 for (unsigned i = 0; i != NumArgs; ++i) {
9418 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9419 report_fatal_error("Passing SVE types to variadic functions is "
9420 "currently not supported");
9421 }
9422 }
9423
9424 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9425
9426 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9427 // Assign locations to each value returned by this call.
9428 SmallVector<CCValAssign, 16> RVLocs;
9429 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9430 *DAG.getContext());
9431 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9432
9433 // Set type id for call site info.
9434 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9435 CSInfo = MachineFunction::CallSiteInfo(*CB);
9436
9437 // Check callee args/returns for SVE registers and set calling convention
9438 // accordingly.
9439 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9440 auto HasSVERegLoc = [](CCValAssign &Loc) {
9441 if (!Loc.isRegLoc())
9442 return false;
9443 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9444 AArch64::PPRRegClass.contains(Loc.getLocReg());
9445 };
9446 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9447 CallConv = CallingConv::AArch64_SVE_VectorCall;
9448 }
9449
9450 // Determine whether we need any streaming mode changes.
9451 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9452
9453 std::optional<unsigned> ZAMarkerNode;
9454 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9455
9456 if (UseNewSMEABILowering) {
9457 if (CallAttrs.requiresLazySave() ||
9458 CallAttrs.requiresPreservingAllZAState())
9459 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9460 else if (CallAttrs.caller().hasZAState() ||
9461 CallAttrs.caller().hasZT0State())
9462 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9463 }
9464
9465 if (IsTailCall) {
9466 // Check if it's really possible to do a tail call.
9467 IsTailCall = isEligibleForTailCallOptimization(CLI);
9468
9469 // A sibling call is one where we're under the usual C ABI and not planning
9470 // to change that but can still do a tail call:
9471 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9472 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9473 IsSibCall = true;
9474
9475 if (IsTailCall)
9476 ++NumTailCalls;
9477 }
9478
9479 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9480 report_fatal_error("failed to perform tail call elimination on a call "
9481 "site marked musttail");
9482
9483 // Get a count of how many bytes are to be pushed on the stack.
9484 unsigned NumBytes = CCInfo.getStackSize();
9485
9486 if (IsSibCall) {
9487 // Since we're not changing the ABI to make this a tail call, the memory
9488 // operands are already available in the caller's incoming argument space.
9489 NumBytes = 0;
9490 }
9491
9492 // FPDiff is the byte offset of the call's argument area from the callee's.
9493 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9494 // by this amount for a tail call. In a sibling call it must be 0 because the
9495 // caller will deallocate the entire stack and the callee still expects its
9496 // arguments to begin at SP+0. Completely unused for non-tail calls.
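// For example, if the caller's incoming argument area is 32 bytes but this
// tail call needs 48 bytes of stack arguments, FPDiff is -16 and the extra 16
// bytes are reserved in our own frame via setTailCallReservedStack below.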
9497 int FPDiff = 0;
9498
9499 if (IsTailCall && !IsSibCall) {
9500 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9501
9502 // Since callee will pop argument stack as a tail call, we must keep the
9503 // popped size 16-byte aligned.
9504 NumBytes = alignTo(NumBytes, 16);
9505
9506 // FPDiff will be negative if this tail call requires more space than we
9507 // would automatically have in our incoming argument space. Positive if we
9508 // can actually shrink the stack.
9509 FPDiff = NumReusableBytes - NumBytes;
9510
9511 // Update the required reserved area if this is the tail call requiring the
9512 // most argument stack space.
9513 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9514 FuncInfo->setTailCallReservedStack(-FPDiff);
9515
9516 // The stack pointer must be 16-byte aligned at all times it's used for a
9517 // memory operation, which in practice means at *all* times and in
9518 // particular across call boundaries. Therefore our own arguments started at
9519 // a 16-byte aligned SP and the delta applied for the tail call should
9520 // satisfy the same constraint.
9521 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9522 }
9523
9524 auto DescribeCallsite =
9525 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9526 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9527 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9528 R << ore::NV("Callee", ES->getSymbol());
9529 else if (CLI.CB && CLI.CB->getCalledFunction())
9530 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9531 else
9532 R << "unknown callee";
9533 R << "'";
9534 return R;
9535 };
9536
9537 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9538 bool RequiresSaveAllZA =
9539 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9540 if (RequiresLazySave) {
9541 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9542 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9543 TPIDR2.FrameIndex,
9544 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9545 Chain = DAG.getNode(
9546 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9547 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9548 TPIDR2ObjAddr);
9549 OptimizationRemarkEmitter ORE(&MF.getFunction());
9550 ORE.emit([&]() {
9551 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9552 CLI.CB)
9553 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9554 &MF.getFunction());
9555 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9556 });
9557 } else if (RequiresSaveAllZA) {
9558 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9559 "Cannot share state that may not exist");
9560 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9561 /*IsSave=*/true);
9562 }
9563
9564 bool RequiresSMChange = CallAttrs.requiresSMChange();
9565 if (RequiresSMChange) {
9566 OptimizationRemarkEmitter ORE(&MF.getFunction());
9567 ORE.emit([&]() {
9568 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9569 CLI.CB)
9570 : OptimizationRemarkAnalysis("sme", "SMETransition",
9571 &MF.getFunction());
9572 DescribeCallsite(R) << " requires a streaming mode transition";
9573 return R;
9574 });
9575 }
9576
9577 SDValue ZTFrameIdx;
9578 MachineFrameInfo &MFI = MF.getFrameInfo();
9579 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9580
9581 // If the caller has ZT0 state which will not be preserved by the callee,
9582 // spill ZT0 before the call.
9583 if (ShouldPreserveZT0) {
9584 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9585
9586 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9587 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9588 }
9589
9590 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9591 // PSTATE.ZA before the call if there is no lazy-save active.
9592 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9593 assert((!DisableZA || !RequiresLazySave) &&
9594 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9595
9596 if (DisableZA)
9597 Chain = DAG.getNode(
9598 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9599 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9600
9601 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9602 // These operations are automatically eliminated by the prolog/epilog pass
9603 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9604 if (!IsSibCall) {
9605 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9606 if (ZAMarkerNode) {
9607 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9608 // using a chain can result in incorrect scheduling. The markers refer to
9609 // the position just before the CALLSEQ_START (though they occur after it,
9610 // as CALLSEQ_START has no in-glue operand).
9611 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9612 {Chain, Chain.getValue(1)});
9613 }
9614 }
9615
9616 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9617 getPointerTy(DAG.getDataLayout()));
9618
9619 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9620 SmallSet<unsigned, 8> RegsUsed;
9621 SmallVector<SDValue, 8> MemOpChains;
9622 auto PtrVT = getPointerTy(DAG.getDataLayout());
9623
9624 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9625 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9626 for (const auto &F : Forwards) {
9627 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9628 RegsToPass.emplace_back(F.PReg, Val);
9629 }
9630 }
9631
9632 // Walk the register/memloc assignments, inserting copies/loads.
9633 unsigned ExtraArgLocs = 0;
9634 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9635 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9636 SDValue Arg = OutVals[i];
9637 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9638
9639 // Promote the value if needed.
9640 switch (VA.getLocInfo()) {
9641 default:
9642 llvm_unreachable("Unknown loc info!");
9643 case CCValAssign::Full:
9644 break;
9645 case CCValAssign::SExt:
9646 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9647 break;
9648 case CCValAssign::ZExt:
9649 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9650 break;
9651 case CCValAssign::AExt:
9652 if (Outs[i].ArgVT == MVT::i1) {
9653 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9654 //
9655 // Check if we actually have to do this, because the value may
9656 // already be zero-extended.
9657 //
9658 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9659 // and rely on DAGCombiner to fold this, because the following
9660 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9661 //
9662 // (ext (zext x)) -> (zext x)
9663 //
9664 // This will give us (zext i32), which we cannot remove, so
9665 // try to check this beforehand.
9666 if (!checkZExtBool(Arg, DAG)) {
9667 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9668 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9669 }
9670 }
9671 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9672 break;
9673 case CCValAssign::AExtUpper:
9674 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9675 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9676 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9677 DAG.getConstant(32, DL, VA.getLocVT()));
9678 break;
9679 case CCValAssign::BCvt:
9680 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9681 break;
9682 case CCValAssign::Trunc:
9683 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9684 break;
9685 case CCValAssign::FPExt:
9686 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9687 break;
9688 case CCValAssign::Indirect: {
9689 bool isScalable = VA.getValVT().isScalableVT();
9690 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9691 "Indirect arguments should be scalable on most subtargets");
9692
9693 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9694 uint64_t PartSize = StoreSize;
9695 unsigned NumParts = 1;
9696 if (Outs[i].Flags.isInConsecutiveRegs()) {
9697 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9698 ++NumParts;
9699 StoreSize *= NumParts;
9700 }
9701
9702 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9703 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9704 MachineFrameInfo &MFI = MF.getFrameInfo();
9705 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9706 if (isScalable) {
9707 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
9708 VA.getValVT().getVectorElementType() == MVT::i1;
9709 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
9710 : TargetStackID::ScalableVector);
9711 }
9712
9713 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9714 SDValue Ptr = DAG.getFrameIndex(
9715 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9716 SDValue SpillSlot = Ptr;
9717
9718 // Ensure we generate all stores for each tuple part, whilst updating the
9719 // pointer after each store correctly using vscale.
9720 while (NumParts) {
9721 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9722 MemOpChains.push_back(Store);
9723
9724 NumParts--;
9725 if (NumParts > 0) {
9726 SDValue BytesIncrement;
9727 if (isScalable) {
9728 BytesIncrement = DAG.getVScale(
9729 DL, Ptr.getValueType(),
9730 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9731 } else {
9732 BytesIncrement = DAG.getConstant(
9733 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9734 Ptr.getValueType());
9735 }
9736 MPI = MachinePointerInfo(MPI.getAddrSpace());
9737 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9738 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9739 ExtraArgLocs++;
9740 i++;
9741 }
9742 }
9743
9744 Arg = SpillSlot;
9745 break;
9746 }
9747
9748 if (VA.isRegLoc()) {
9749 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9750 Outs[0].VT == MVT::i64) {
9751 assert(VA.getLocVT() == MVT::i64 &&
9752 "unexpected calling convention register assignment");
9753 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9754 "unexpected use of 'returned'");
9755 IsThisReturn = true;
9756 }
9757 if (RegsUsed.count(VA.getLocReg())) {
9758 // If this register has already been used then we're trying to pack
9759 // parts of an [N x i32] into an X-register. The extension type will
9760 // take care of putting the two halves in the right place but we have to
9761 // combine them.
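      // For illustration (register choice assumed): with two i32 parts sharing
      // x0, part 0 lands in bits [31:0] and part 1 (shifted by AExtUpper above)
      // in bits [63:32], so the OR below produces roughly
      //   x0 = (part1 << 32) | zext(part0)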
9762 SDValue &Bits =
9763 llvm::find_if(RegsToPass,
9764 [=](const std::pair<unsigned, SDValue> &Elt) {
9765 return Elt.first == VA.getLocReg();
9766 })
9767 ->second;
9768 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9769 // Call site info is used for function's parameter entry value
9770 // tracking. For now we track only simple cases when parameter
9771 // is transferred through whole register.
9772 llvm::erase_if(CSInfo.ArgRegPairs,
9773 [&VA](MachineFunction::ArgRegPair ArgReg) {
9774 return ArgReg.Reg == VA.getLocReg();
9775 });
9776 } else {
9777 // Add an extra level of indirection for streaming mode changes by
9778 // using a pseudo copy node that cannot be rematerialised between a
9779 // smstart/smstop and the call by the simple register coalescer.
9780 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9781 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9782 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9783 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9784 RegsUsed.insert(VA.getLocReg());
9785 const TargetOptions &Options = DAG.getTarget().Options;
9786 if (Options.EmitCallSiteInfo)
9787 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9788 }
9789 } else {
9790 assert(VA.isMemLoc());
9791
9792 SDValue DstAddr;
9793 MachinePointerInfo DstInfo;
9794
9795 // FIXME: This works on big-endian for composite byvals, which are the
9796 // common case. It should also work for fundamental types too.
9797 uint32_t BEAlign = 0;
9798 unsigned OpSize;
9799 if (VA.getLocInfo() == CCValAssign::Indirect ||
9800 VA.getLocInfo() == CCValAssign::Trunc)
9801 OpSize = VA.getLocVT().getFixedSizeInBits();
9802 else
9803 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9804 : VA.getValVT().getSizeInBits();
9805 OpSize = (OpSize + 7) / 8;
9806 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9807 !Flags.isInConsecutiveRegs()) {
9808 if (OpSize < 8)
9809 BEAlign = 8 - OpSize;
9810 }
9811 unsigned LocMemOffset = VA.getLocMemOffset();
9812 int32_t Offset = LocMemOffset + BEAlign;
9813
9814 if (IsTailCall) {
9815 // When the frame pointer is perfectly aligned for the tail call and the
9816 // same stack argument is passed down intact, we can reuse it.
9817 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9818 continue;
9819
9820 Offset = Offset + FPDiff;
9821 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9822
9823 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9824 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9825
9826 // Make sure any stack arguments overlapping with where we're storing
9827 // are loaded before this eventual operation. Otherwise they'll be
9828 // clobbered.
9829 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9830 } else {
9831 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9832
9833 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9834 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9835 }
9836
9837 if (Outs[i].Flags.isByVal()) {
9838 SDValue SizeNode =
9839 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9840 SDValue Cpy = DAG.getMemcpy(
9841 Chain, DL, DstAddr, Arg, SizeNode,
9842 Outs[i].Flags.getNonZeroByValAlign(),
9843 /*isVol = */ false, /*AlwaysInline = */ false,
9844 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9845
9846 MemOpChains.push_back(Cpy);
9847 } else {
9848 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9849 // promoted to a legal register type i32, we should truncate Arg back to
9850 // i1/i8/i16.
9851 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9852 VA.getValVT() == MVT::i16)
9853 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9854
9855 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9856 MemOpChains.push_back(Store);
9857 }
9858 }
9859 }
9860
9861 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9862 !(CLI.CB && CLI.CB->isMustTailCall())) {
9863 SDValue ParamPtr = StackPtr;
9864 if (IsTailCall) {
9865 // Create a dummy object at the top of the stack that can be used to get
9866 // the SP after the epilogue
9867 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9868 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9869 }
9870
9871 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9872 // describing the argument list. x4 contains the address of the
9873 // first stack parameter. x5 contains the size in bytes of all parameters
9874 // passed on the stack.
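    // For illustration (sizes assumed): a vararg call that passes 24 bytes of
    // stack arguments ends up with x4 = ParamPtr (address of the first stack
    // argument) and x5 = 24 (NumBytes).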
9875 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9876 RegsToPass.emplace_back(AArch64::X5,
9877 DAG.getConstant(NumBytes, DL, MVT::i64));
9878 }
9879
9880 if (!MemOpChains.empty())
9881 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9882
9883 SDValue InGlue;
9884 if (RequiresSMChange) {
9885 bool InsertVectorLengthCheck =
9887 Chain = changeStreamingMode(
9888 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9889 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9890 InGlue = Chain.getValue(1);
9891 }
9892
9893 // Build a sequence of copy-to-reg nodes chained together with token chain
9894 // and flag operands which copy the outgoing args into the appropriate regs.
9895 for (auto &RegToPass : RegsToPass) {
9896 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9897 RegToPass.second, InGlue);
9898 InGlue = Chain.getValue(1);
9899 }
9900
9901 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9902 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9903 // node so that legalize doesn't hack it.
9904 const GlobalValue *CalledGlobal = nullptr;
9905 unsigned OpFlags = 0;
9906 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9907 CalledGlobal = G->getGlobal();
9908 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9909 getTargetMachine());
9910 if (OpFlags & AArch64II::MO_GOT) {
9911 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9912 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9913 } else {
9914 const GlobalValue *GV = G->getGlobal();
9915 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9916 }
9917 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9918 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9919 Subtarget->isTargetMachO()) ||
9920 MF.getFunction().getParent()->getRtLibUseGOT();
9921 const char *Sym = S->getSymbol();
9922 if (UseGot) {
9923 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9924 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9925 } else {
9926 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9927 }
9928 }
9929
9930 // We don't usually want to end the call-sequence here because we would tidy
9931 // the frame up *after* the call, however in the ABI-changing tail-call case
9932 // we've carefully laid out the parameters so that when sp is reset they'll be
9933 // in the correct location.
9934 if (IsTailCall && !IsSibCall) {
9935 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9936 InGlue = Chain.getValue(1);
9937 }
9938
9939 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9940
9941 std::vector<SDValue> Ops;
9942 Ops.push_back(Chain);
9943 Ops.push_back(Callee);
9944
9945 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9946 // be expanded to the call, directly followed by a special marker sequence and
9947 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
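  // For illustration, the typical Apple arm64 expansion looks roughly like:
  //   bl   _callee
  //   mov  x29, x29        ; retainRV/claimRV marker (only if needed)
  //   bl   _objc_retainAutoreleasedReturnValue
  // The exact sequence is decided later, when the pseudo is expanded.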
9948 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9949 assert(!IsTailCall &&
9950 "tail calls cannot be marked with clang.arc.attachedcall");
9951 Opc = AArch64ISD::CALL_RVMARKER;
9952
9953 // Add a target global address for the retainRV/claimRV runtime function
9954 // just before the call target.
9955 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9956 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9957 Ops.insert(Ops.begin() + 1, GA);
9958
9959 // We may or may not need to emit both the marker and the retain/claim call.
9960 // Tell the pseudo expansion using an additional boolean op.
9961 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9962 SDValue DoEmitMarker =
9963 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9964 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9965 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9966 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9967 } else if (GuardWithBTI) {
9968 Opc = AArch64ISD::CALL_BTI;
9969 }
9970
9971 if (IsTailCall) {
9972 // Each tail call may have to adjust the stack by a different amount, so
9973 // this information must travel along with the operation for eventual
9974 // consumption by emitEpilogue.
9975 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9976 }
9977
9978 if (CLI.PAI) {
9979 const uint64_t Key = CLI.PAI->Key;
9981 "Invalid auth call key");
9982
9983 // Split the discriminator into address/integer components.
9984 SDValue AddrDisc, IntDisc;
9985 std::tie(IntDisc, AddrDisc) =
9986 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9987
9988 if (Opc == AArch64ISD::CALL_RVMARKER)
9989 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9990 else
9991 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9992 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9993 Ops.push_back(IntDisc);
9994 Ops.push_back(AddrDisc);
9995 }
9996
9997 // Add argument registers to the end of the list so that they are known live
9998 // into the call.
9999 for (auto &RegToPass : RegsToPass)
10000 Ops.push_back(DAG.getRegister(RegToPass.first,
10001 RegToPass.second.getValueType()));
10002
10003 // Add a register mask operand representing the call-preserved registers.
10004 const uint32_t *Mask;
10005 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10006 if (IsThisReturn) {
10007 // For 'this' returns, use the X0-preserving mask if applicable
10008 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10009 if (!Mask) {
10010 IsThisReturn = false;
10011 Mask = TRI->getCallPreservedMask(MF, CallConv);
10012 }
10013 } else
10014 Mask = TRI->getCallPreservedMask(MF, CallConv);
10015
10016 if (Subtarget->hasCustomCallingConv())
10017 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10018
10019 if (TRI->isAnyArgRegReserved(MF))
10020 TRI->emitReservedArgRegCallError(MF);
10021
10022 assert(Mask && "Missing call preserved mask for calling convention");
10023 Ops.push_back(DAG.getRegisterMask(Mask));
10024
10025 if (InGlue.getNode())
10026 Ops.push_back(InGlue);
10027
10028 // If we're doing a tail call, use a TC_RETURN here rather than an
10029 // actual call instruction.
10030 if (IsTailCall) {
10031 MF.getFrameInfo().setHasTailCall();
10032 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10033 if (IsCFICall)
10034 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10035
10036 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10037 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10038 if (CalledGlobal &&
10039 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10040 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10041 return Ret;
10042 }
10043
10044 // Returns a chain and a flag for retval copy to use.
10045 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10046 if (IsCFICall)
10047 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10048
10049 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10050 InGlue = Chain.getValue(1);
10051 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10052 if (CalledGlobal &&
10053 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10054 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10055
10056 uint64_t CalleePopBytes =
10057 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10058
10059 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10060 InGlue = Chain.getValue(1);
10061
10062 // Handle result values, copying them out of physregs into vregs that we
10063 // return.
10064 SDValue Result = LowerCallResult(
10065 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10066 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10067
10068 if (!Ins.empty())
10069 InGlue = Result.getValue(Result->getNumValues() - 1);
10070
10071 if (RequiresSMChange) {
10072 Result = changeStreamingMode(
10073 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10074 getSMToggleCondition(CallAttrs));
10075 }
10076
10077 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10078 // Unconditionally resume ZA.
10079 Result = DAG.getNode(
10080 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10081 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10082
10083 if (ShouldPreserveZT0)
10084 Result =
10085 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10086 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10087
10088 if (RequiresLazySave) {
10089 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10090 } else if (RequiresSaveAllZA) {
10091 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10092 /*IsSave=*/false);
10093 }
10094
10095 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10096 RequiresSaveAllZA) {
10097 for (unsigned I = 0; I < InVals.size(); ++I) {
10098 // The smstart/smstop is chained as part of the call, but when the
10099 // resulting chain is discarded (which happens when the call is not part
10100 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10101 // smstart/smstop is chained to the result value. We can do that by doing
10102 // a vreg -> vreg copy.
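      // For example (hypothetical IR): the result of
      //   %r = call float @llvm.cos.f32(float %x)
      // has no chain user of its own, so the copies below keep the
      // smstart/smstop glued to the value that is actually consumed.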
10103 Register Reg = MF.getRegInfo().createVirtualRegister(
10104 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10105 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10106 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10107 InVals[I].getValueType());
10108 }
10109 }
10110
10111 if (CallConv == CallingConv::PreserveNone) {
10112 for (const ISD::OutputArg &O : Outs) {
10113 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10114 O.Flags.isSwiftAsync()) {
10115 MachineFunction &MF = DAG.getMachineFunction();
10116 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10117 MF.getFunction(),
10118 "Swift attributes can't be used with preserve_none",
10119 DL.getDebugLoc()));
10120 break;
10121 }
10122 }
10123 }
10124
10125 return Result;
10126}
10127
10128bool AArch64TargetLowering::CanLowerReturn(
10129 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10130 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10131 const Type *RetTy) const {
10132 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10133 SmallVector<CCValAssign, 16> RVLocs;
10134 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10135 return CCInfo.CheckReturn(Outs, RetCC);
10136}
10137
10138SDValue
10139AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10140 bool isVarArg,
10141 const SmallVectorImpl<ISD::OutputArg> &Outs,
10142 const SmallVectorImpl<SDValue> &OutVals,
10143 const SDLoc &DL, SelectionDAG &DAG) const {
10144 auto &MF = DAG.getMachineFunction();
10145 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10146
10147 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10148 SmallVector<CCValAssign, 16> RVLocs;
10149 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10150 CCInfo.AnalyzeReturn(Outs, RetCC);
10151
10152 // Copy the result values into the output registers.
10153 SDValue Glue;
10154 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10155 SmallSet<unsigned, 4> RegsUsed;
10156 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10157 ++i, ++realRVLocIdx) {
10158 CCValAssign &VA = RVLocs[i];
10159 assert(VA.isRegLoc() && "Can only return in registers!");
10160 SDValue Arg = OutVals[realRVLocIdx];
10161
10162 switch (VA.getLocInfo()) {
10163 default:
10164 llvm_unreachable("Unknown loc info!");
10165 case CCValAssign::Full:
10166 if (Outs[i].ArgVT == MVT::i1) {
10167 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10168 // value. This is strictly redundant on Darwin (which uses "zeroext
10169 // i1"), but will be optimised out before ISel.
10170 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10171 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10172 }
10173 break;
10174 case CCValAssign::BCvt:
10175 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10176 break;
10177 case CCValAssign::AExt:
10178 case CCValAssign::ZExt:
10179 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10180 break;
10181 case CCValAssign::AExtUpper:
10182 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10183 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10184 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10185 DAG.getConstant(32, DL, VA.getLocVT()));
10186 break;
10187 }
10188
10189 if (RegsUsed.count(VA.getLocReg())) {
10190 SDValue &Bits =
10191 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10192 return Elt.first == VA.getLocReg();
10193 })->second;
10194 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10195 } else {
10196 RetVals.emplace_back(VA.getLocReg(), Arg);
10197 RegsUsed.insert(VA.getLocReg());
10198 }
10199 }
10200
10201 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10202
10203 // Emit SMSTOP before returning from a locally streaming function
10204 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10205 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10206 if (FuncAttrs.hasStreamingCompatibleInterface())
10207 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10208 /*Glue*/ SDValue(),
10210 else
10211 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10212 /*Glue*/ SDValue(), AArch64SME::Always);
10213 Glue = Chain.getValue(1);
10214 }
10215
10216 SmallVector<SDValue, 4> RetOps(1, Chain);
10217 for (auto &RetVal : RetVals) {
10218 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10219 isPassedInFPR(RetVal.second.getValueType()))
10220 RetVal.second =
10221 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10222 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10223 RetVal.second);
10224 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10225 Glue = Chain.getValue(1);
10226 RetOps.push_back(
10227 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10228 }
10229
10230 // Windows AArch64 ABIs require that for returning structs by value we copy
10231 // the sret argument into X0 for the return.
10232 // We saved the argument into a virtual register in the entry block,
10233 // so now we copy the value out and into X0.
10234 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10235 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10236 getPointerTy(MF.getDataLayout()));
10237
10238 unsigned RetValReg = AArch64::X0;
10239 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10240 RetValReg = AArch64::X8;
10241 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10242 Glue = Chain.getValue(1);
10243
10244 RetOps.push_back(
10245 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10246 }
10247
10248 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10249 if (I) {
10250 for (; *I; ++I) {
10251 if (AArch64::GPR64RegClass.contains(*I))
10252 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10253 else if (AArch64::FPR64RegClass.contains(*I))
10254 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10255 else
10256 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10257 }
10258 }
10259
10260 RetOps[0] = Chain; // Update chain.
10261
10262 // Add the glue if we have it.
10263 if (Glue.getNode())
10264 RetOps.push_back(Glue);
10265
10266 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10267 // ARM64EC entry thunks use a special return sequence: instead of a regular
10268 // "ret" instruction, they need to explicitly call the emulator.
10269 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10270 SDValue Arm64ECRetDest =
10271 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10272 Arm64ECRetDest =
10273 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10274 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10275 MachinePointerInfo());
10276 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10277 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10278 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10279 }
10280
10281 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10282}
10283
10284//===----------------------------------------------------------------------===//
10285// Other Lowering Code
10286//===----------------------------------------------------------------------===//
10287
10288SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10289 SelectionDAG &DAG,
10290 unsigned Flag) const {
10291 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10292 N->getOffset(), Flag);
10293}
10294
10295SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10296 SelectionDAG &DAG,
10297 unsigned Flag) const {
10298 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10299}
10300
10301SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10302 SelectionDAG &DAG,
10303 unsigned Flag) const {
10304 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10305 N->getOffset(), Flag);
10306}
10307
10308SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10309 SelectionDAG &DAG,
10310 unsigned Flag) const {
10311 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10312}
10313
10314SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10315 SelectionDAG &DAG,
10316 unsigned Flag) const {
10317 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10318}
10319
10320// (loadGOT sym)
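// Typical ELF expansion, shown for illustration:
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]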
10321template <class NodeTy>
10322SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10323 unsigned Flags) const {
10324 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10325 SDLoc DL(N);
10326 EVT Ty = getPointerTy(DAG.getDataLayout());
10327 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10328 // FIXME: Once remat is capable of dealing with instructions with register
10329 // operands, expand this into two nodes instead of using a wrapper node.
10330 if (DAG.getMachineFunction()
10331 .getInfo<AArch64FunctionInfo>()
10332 ->hasELFSignedGOT())
10333 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10334 0);
10335 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10336}
10337
10338// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10339template <class NodeTy>
10340SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10341 unsigned Flags) const {
10342 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10343 SDLoc DL(N);
10344 EVT Ty = getPointerTy(DAG.getDataLayout());
10345 const unsigned char MO_NC = AArch64II::MO_NC;
10346 return DAG.getNode(
10347 AArch64ISD::WrapperLarge, DL, Ty,
10348 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10349 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10350 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10351 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10352}
10353
10354// (addlow (adrp %hi(sym)) %lo(sym))
10355template <class NodeTy>
10356SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10357 unsigned Flags) const {
10358 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10359 SDLoc DL(N);
10360 EVT Ty = getPointerTy(DAG.getDataLayout());
10361 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10362 SDValue Lo = getTargetNode(N, Ty, DAG,
10363 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10364 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10365 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10366}
10367
10368// (adr sym)
10369template <class NodeTy>
10370SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10371 unsigned Flags) const {
10372 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10373 SDLoc DL(N);
10374 EVT Ty = getPointerTy(DAG.getDataLayout());
10375 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10376 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10377}
10378
10379SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10380 SelectionDAG &DAG) const {
10381 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10382 const GlobalValue *GV = GN->getGlobal();
10383 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10384
10385 if (OpFlags != AArch64II::MO_NO_FLAG)
10387 "unexpected offset in global node");
10388
10389 // This also catches the large code model case for Darwin, and tiny code
10390 // model with got relocations.
10391 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10392 return getGOT(GN, DAG, OpFlags);
10393 }
10394
10395 SDValue Result;
10396 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10397 !getTargetMachine().isPositionIndependent()) {
10398 Result = getAddrLarge(GN, DAG, OpFlags);
10399 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10400 Result = getAddrTiny(GN, DAG, OpFlags);
10401 } else {
10402 Result = getAddr(GN, DAG, OpFlags);
10403 }
10404 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10405 SDLoc DL(GN);
10406 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10407 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10408 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10409 return Result;
10410}
10411
10412/// Convert a TLS address reference into the correct sequence of loads
10413/// and calls to compute the variable's address (for Darwin, currently) and
10414/// return an SDValue containing the final node.
10415
10416/// Darwin only has one TLS scheme which must be capable of dealing with the
10417/// fully general situation, in the worst case. This means:
10418/// + "extern __thread" declaration.
10419/// + Defined in a possibly unknown dynamic library.
10420///
10421/// The general system is that each __thread variable has a [3 x i64] descriptor
10422/// which contains information used by the runtime to calculate the address. The
10423/// only part of this the compiler needs to know about is the first xword, which
10424/// contains a function pointer that must be called with the address of the
10425/// entire descriptor in "x0".
10426///
10427/// Since this descriptor may be in a different unit, in general even the
10428/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10429/// is:
10430/// adrp x0, _var@TLVPPAGE
10431/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10432/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10433/// ; the function pointer
10434/// blr x1 ; Uses descriptor address in x0
10435/// ; Address of _var is now in x0.
10436///
10437/// If the address of _var's descriptor *is* known to the linker, then it can
10438/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10439/// a slight efficiency gain.
10440SDValue
10441AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10442 SelectionDAG &DAG) const {
10443 assert(Subtarget->isTargetDarwin() &&
10444 "This function expects a Darwin target");
10445
10446 SDLoc DL(Op);
10447 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10448 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10449 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10450
10451 SDValue TLVPAddr =
10452 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10453 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10454
10455 // The first entry in the descriptor is a function pointer that we must call
10456 // to obtain the address of the variable.
10457 SDValue Chain = DAG.getEntryNode();
10458 SDValue FuncTLVGet = DAG.getLoad(
10459 PtrMemVT, DL, Chain, DescAddr,
10460 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10461 Align(PtrMemVT.getSizeInBits() / 8),
10462 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10463 Chain = FuncTLVGet.getValue(1);
10464
10465 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10466 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10467
10468 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10469 MFI.setAdjustsStack(true);
10470
10471 // TLS calls preserve all registers except those that absolutely must be
10472 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10473 // silly).
10474 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10475 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10476 if (Subtarget->hasCustomCallingConv())
10477 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10478
10479 // Finally, we can make the call. This is just a degenerate version of a
10480 // normal AArch64 call node: x0 takes the address of the descriptor, and
10481 // returns the address of the variable in this thread.
10482 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10483
10484 unsigned Opcode = AArch64ISD::CALL;
10485 SmallVector<SDValue, 8> Ops;
10486 Ops.push_back(Chain);
10487 Ops.push_back(FuncTLVGet);
10488
10489 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10490 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10491 Opcode = AArch64ISD::AUTH_CALL;
10492 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10493 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10494 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10495 }
10496
10497 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10498 Ops.push_back(DAG.getRegisterMask(Mask));
10499 Ops.push_back(Chain.getValue(1));
10500 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10501 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10502}
10503
10504/// Convert a thread-local variable reference into a sequence of instructions to
10505/// compute the variable's address for the local exec TLS model of ELF targets.
10506/// The sequence depends on the maximum TLS area size.
10507SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10508 SDValue ThreadBase,
10509 const SDLoc &DL,
10510 SelectionDAG &DAG) const {
10511 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10512 SDValue TPOff, Addr;
10513
10514 switch (DAG.getTarget().Options.TLSSize) {
10515 default:
10516 llvm_unreachable("Unexpected TLS size");
10517
10518 case 12: {
10519 // mrs x0, TPIDR_EL0
10520 // add x0, x0, :tprel_lo12:a
10521 SDValue Var = DAG.getTargetGlobalAddress(
10522 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10523 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10524 Var,
10525 DAG.getTargetConstant(0, DL, MVT::i32)),
10526 0);
10527 }
10528
10529 case 24: {
10530 // mrs x0, TPIDR_EL0
10531 // add x0, x0, :tprel_hi12:a
10532 // add x0, x0, :tprel_lo12_nc:a
10533 SDValue HiVar = DAG.getTargetGlobalAddress(
10534 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10535 SDValue LoVar = DAG.getTargetGlobalAddress(
10536 GV, DL, PtrVT, 0,
10537 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10538 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10539 HiVar,
10540 DAG.getTargetConstant(0, DL, MVT::i32)),
10541 0);
10542 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10543 LoVar,
10544 DAG.getTargetConstant(0, DL, MVT::i32)),
10545 0);
10546 }
10547
10548 case 32: {
10549 // mrs x1, TPIDR_EL0
10550 // movz x0, #:tprel_g1:a
10551 // movk x0, #:tprel_g0_nc:a
10552 // add x0, x1, x0
10553 SDValue HiVar = DAG.getTargetGlobalAddress(
10554 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10555 SDValue LoVar = DAG.getTargetGlobalAddress(
10556 GV, DL, PtrVT, 0,
10557 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10558 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10559 DAG.getTargetConstant(16, DL, MVT::i32)),
10560 0);
10561 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10562 DAG.getTargetConstant(0, DL, MVT::i32)),
10563 0);
10564 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10565 }
10566
10567 case 48: {
10568 // mrs x1, TPIDR_EL0
10569 // movz x0, #:tprel_g2:a
10570 // movk x0, #:tprel_g1_nc:a
10571 // movk x0, #:tprel_g0_nc:a
10572 // add x0, x1, x0
10573 SDValue HiVar = DAG.getTargetGlobalAddress(
10574 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10575 SDValue MiVar = DAG.getTargetGlobalAddress(
10576 GV, DL, PtrVT, 0,
10577 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10578 SDValue LoVar = DAG.getTargetGlobalAddress(
10579 GV, DL, PtrVT, 0,
10580 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10581 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10582 DAG.getTargetConstant(32, DL, MVT::i32)),
10583 0);
10584 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10585 DAG.getTargetConstant(16, DL, MVT::i32)),
10586 0);
10587 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10588 DAG.getTargetConstant(0, DL, MVT::i32)),
10589 0);
10590 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10591 }
10592 }
10593}
10594
10595/// When accessing thread-local variables under either the general-dynamic or
10596/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10597/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10598/// is a function pointer to carry out the resolution.
10599///
10600/// The sequence is:
10601/// adrp x0, :tlsdesc:var
10602/// ldr x1, [x0, #:tlsdesc_lo12:var]
10603/// add x0, x0, #:tlsdesc_lo12:var
10604/// .tlsdesccall var
10605/// blr x1
10606/// (TPIDR_EL0 offset now in x0)
10607///
10608/// The above sequence must be produced unscheduled, to enable the linker to
10609/// optimize/relax this sequence.
10610/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10611/// above sequence, and expanded really late in the compilation flow, to ensure
10612/// the sequence is produced as per above.
10613SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10614 const SDLoc &DL,
10615 SelectionDAG &DAG) const {
10616 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10617
10618 SDValue Chain = DAG.getEntryNode();
10619 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10620
10621 unsigned Opcode =
10622 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10623 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10624 : AArch64ISD::TLSDESC_CALLSEQ;
10625 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10626 SDValue Glue = Chain.getValue(1);
10627
10628 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10629}
10630
10631SDValue
10632AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10633 SelectionDAG &DAG) const {
10634 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10635
10636 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10637 AArch64FunctionInfo *MFI =
10638 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10639
10643
10645 if (Model == TLSModel::LocalDynamic)
10646 Model = TLSModel::GeneralDynamic;
10647 }
10648
10649 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10650 Model != TLSModel::LocalExec)
10651 report_fatal_error("ELF TLS only supported in small memory model or "
10652 "in local exec TLS model");
10653 // Different choices can be made for the maximum size of the TLS area for a
10654 // module. For the small address model, the default TLS size is 16MiB and the
10655 // maximum TLS size is 4GiB.
10656 // FIXME: add tiny and large code model support for TLS access models other
10657 // than local exec. We currently generate the same code as small for tiny,
10658 // which may be larger than needed.
10659
10660 SDValue TPOff;
10661 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10662 SDLoc DL(Op);
10663 const GlobalValue *GV = GA->getGlobal();
10664
10665 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10666
10667 if (Model == TLSModel::LocalExec) {
10668 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10669 } else if (Model == TLSModel::InitialExec) {
10670 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10671 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10672 } else if (Model == TLSModel::LocalDynamic) {
10673 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10674 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10675 // the beginning of the module's TLS region, followed by a DTPREL offset
10676 // calculation.
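    // Rough shape of the emitted code (small code model, shown for
    // illustration):
    //   <TLSDESC call on _TLS_MODULE_BASE_>   ; offset of the module's TLS area
    //   add x0, x0, :dtprel_hi12:var
    //   add x0, x0, :dtprel_lo12_nc:var
    // The result is added to TPIDR_EL0 at the end of this function.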
10677
10678 // These accesses will need deduplicating if there's more than one.
10680
10681 // The call needs a relocation too for linker relaxation. It doesn't make
10682 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10683 // the address.
10684 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10686
10687 // Now we can calculate the offset from TPIDR_EL0 to this module's
10688 // thread-local area.
10689 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10690
10691 // Now use :dtprel_whatever: operations to calculate this variable's offset
10692 // in its thread-storage area.
10693 SDValue HiVar = DAG.getTargetGlobalAddress(
10694 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10695 SDValue LoVar = DAG.getTargetGlobalAddress(
10696 GV, DL, MVT::i64, 0,
10697 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10698
10699 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10700 DAG.getTargetConstant(0, DL, MVT::i32)),
10701 0);
10702 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10703 DAG.getTargetConstant(0, DL, MVT::i32)),
10704 0);
10705 } else if (Model == TLSModel::GeneralDynamic) {
10706 // The call needs a relocation too for linker relaxation. It doesn't make
10707 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10708 // the address.
10709 SDValue SymAddr =
10710 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10711
10712 // Finally we can make a call to calculate the offset from tpidr_el0.
10713 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10714 } else
10715 llvm_unreachable("Unsupported ELF TLS access model");
10716
10717 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10718}
10719
10720SDValue
10721AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10722 SelectionDAG &DAG) const {
10723 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10724
10725 SDValue Chain = DAG.getEntryNode();
10726 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10727 SDLoc DL(Op);
10728
10729 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10730
10731 // Load the ThreadLocalStoragePointer from the TEB
10732 // A pointer to the TLS array is located at offset 0x58 from the TEB.
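  // In pseudo-C, the address computed below is roughly (for illustration):
  //   base = ((char **)(TEB + 0x58))[_tls_index];  // this module's TLS block
  //   addr = base + <section-relative offset of the variable>;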
10733 SDValue TLSArray =
10734 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10735 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10736 Chain = TLSArray.getValue(1);
10737
10738 // Load the TLS index from the C runtime;
10739 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10740 // This also does the same as LOADgot, but using a generic i32 load,
10741 // while LOADgot only loads i64.
10742 SDValue TLSIndexHi =
10743 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10744 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10745 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10746 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10747 SDValue TLSIndex =
10748 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10749 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10750 Chain = TLSIndex.getValue(1);
10751
10752 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
10753 // offset into the TLSArray.
10754 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10755 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10756 DAG.getConstant(3, DL, PtrVT));
10757 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10758 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10759 MachinePointerInfo());
10760 Chain = TLS.getValue(1);
10761
10762 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10763 const GlobalValue *GV = GA->getGlobal();
10764 SDValue TGAHi = DAG.getTargetGlobalAddress(
10765 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10766 SDValue TGALo = DAG.getTargetGlobalAddress(
10767 GV, DL, PtrVT, 0,
10768 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10769
10770 // Add the offset from the start of the .tls section (section base).
10771 SDValue Addr =
10772 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10773 DAG.getTargetConstant(0, DL, MVT::i32)),
10774 0);
10775 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10776 return Addr;
10777}
10778
10779SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10780 SelectionDAG &DAG) const {
10781 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10782 if (DAG.getTarget().useEmulatedTLS())
10783 return LowerToTLSEmulatedModel(GA, DAG);
10784
10785 if (Subtarget->isTargetDarwin())
10786 return LowerDarwinGlobalTLSAddress(Op, DAG);
10787 if (Subtarget->isTargetELF())
10788 return LowerELFGlobalTLSAddress(Op, DAG);
10789 if (Subtarget->isTargetWindows())
10790 return LowerWindowsGlobalTLSAddress(Op, DAG);
10791
10792 llvm_unreachable("Unexpected platform trying to use TLS");
10793}
10794
10795//===----------------------------------------------------------------------===//
10796// PtrAuthGlobalAddress lowering
10797//
10798// We have 3 lowering alternatives to choose from:
10799// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10800// If the GV doesn't need a GOT load (i.e., is locally defined)
10801// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10802//
10803// - LOADgotPAC: similar to LOADgot, with added PAC.
10804// If the GV needs a GOT load, materialize the pointer using the usual
10805// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10806// section is assumed to be read-only (for example, via relro mechanism). See
10807// LowerMOVaddrPAC.
10808//
10809// - LOADauthptrstatic: similar to LOADgot, but use a
10810// special stub slot instead of a GOT slot.
10811// Load a signed pointer for symbol 'sym' from a stub slot named
10812// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10813// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10814// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10815//
10816// All 3 are pseudos that are expand late to longer sequences: this lets us
10817// provide integrity guarantees on the to-be-signed intermediate values.
10818//
10819// LOADauthptrstatic is undesirable because it requires a large section filled
10820// with often similarly-signed pointers, making it a good harvesting target.
10821// Thus, it's only used for ptrauth references to extern_weak to avoid null
10822// checks.
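// For reference, a signed reference in IR looks roughly like (illustrative,
// simplest form of the ptrauth constant):
//   @fptr = global ptr ptrauth (ptr @callee, i32 0, i64 1234)
// and is lowered to one of the three pseudos described above.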
10823
10824 static SDValue LowerPtrAuthGlobalAddressStatically(
10825 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10826 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10827 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10828 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10829
10830 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10831 // offset alone as a pointer if the symbol wasn't available, which would
10832 // probably break null checks in users. Ptrauth complicates things further:
10833 // error out.
10834 if (TGN->getOffset() != 0)
10836 "unsupported non-zero offset in weak ptrauth global reference");
10837
10838 if (!isNullConstant(AddrDiscriminator))
10839 report_fatal_error("unsupported weak addr-div ptrauth global");
10840
10841 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10842 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10843 {TGA, Key, Discriminator}),
10844 0);
10845}
10846
10847SDValue
10848AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10849 SelectionDAG &DAG) const {
10850 SDValue Ptr = Op.getOperand(0);
10851 uint64_t KeyC = Op.getConstantOperandVal(1);
10852 SDValue AddrDiscriminator = Op.getOperand(2);
10853 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10854 EVT VT = Op.getValueType();
10855 SDLoc DL(Op);
10856
10857 if (KeyC > AArch64PACKey::LAST)
10858 report_fatal_error("key in ptrauth global out of range [0, " +
10859 Twine((int)AArch64PACKey::LAST) + "]");
10860
10861 // Blend only works if the integer discriminator is 16-bit wide.
10862 if (!isUInt<16>(DiscriminatorC))
10864 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10865
10866 // Choosing between 3 lowering alternatives is target-specific.
10867 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10868 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10869
10870 int64_t PtrOffsetC = 0;
10871 if (Ptr.getOpcode() == ISD::ADD) {
10872 PtrOffsetC = Ptr.getConstantOperandVal(1);
10873 Ptr = Ptr.getOperand(0);
10874 }
10875 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10876 const GlobalValue *PtrGV = PtrN->getGlobal();
10877
10878 // Classify the reference to determine whether it needs a GOT load.
10879 const unsigned OpFlags =
10880 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10881 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10882 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10883 "unsupported non-GOT op flags on ptrauth global reference");
10884
10885 // Fold any offset into the GV; our pseudos expect it there.
10886 PtrOffsetC += PtrN->getOffset();
10887 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10888 /*TargetFlags=*/0);
10889 assert(PtrN->getTargetFlags() == 0 &&
10890 "unsupported target flags on ptrauth global");
10891
10892 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10893 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10894 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10895 ? AddrDiscriminator
10896 : DAG.getRegister(AArch64::XZR, MVT::i64);
10897
10898 // No GOT load needed -> MOVaddrPAC
10899 if (!NeedsGOTLoad) {
10900 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10901 return SDValue(
10902 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10903 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10904 0);
10905 }
10906
10907 // GOT load -> LOADgotPAC
10908 // Note that we disallow extern_weak refs to avoid null checks later.
10909 if (!PtrGV->hasExternalWeakLinkage())
10910 return SDValue(
10911 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10912 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10913 0);
10914
10915 // extern_weak ref -> LOADauthptrstatic
10916 return LowerPtrAuthGlobalAddressStatically(
10917 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10918 DAG);
10919}
10920
10921// Looks through \param Val to determine the bit that can be used to
10922// check the sign of the value. It returns the unextended value and
10923// the sign bit position.
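// For example, (sign_extend_inreg i32 %x, i8) yields {%x, 7}, while a plain
// i32 value yields {value, 31}.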
10924std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10925 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10926 return {Val.getOperand(0),
10927 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10928 1};
10929
10930 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10931 return {Val.getOperand(0),
10932 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10933
10934 return {Val, Val.getValueSizeInBits() - 1};
10935}
10936
10937SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10938 SDValue Chain = Op.getOperand(0);
10939 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10940 SDValue LHS = Op.getOperand(2);
10941 SDValue RHS = Op.getOperand(3);
10942 SDValue Dest = Op.getOperand(4);
10943 SDLoc DL(Op);
10944
10945 MachineFunction &MF = DAG.getMachineFunction();
10946 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10947 // will not be produced, as they are conditional branch instructions that do
10948 // not set flags.
10949 bool ProduceNonFlagSettingCondBr =
10950 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10951
10952 // Handle f128 first, since lowering it will result in comparing the return
10953 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10954 // is expecting to deal with.
10955 if (LHS.getValueType() == MVT::f128) {
10956 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10957
10958 // If softenSetCCOperands returned a scalar, we need to compare the result
10959 // against zero to select between true and false values.
10960 if (!RHS.getNode()) {
10961 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10962 CC = ISD::SETNE;
10963 }
10964 }
10965
10966 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10967 // instruction.
10968 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10969 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10970 // Only lower legal XALUO ops.
10971 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10972 return SDValue();
10973
10974 // The actual operation with overflow check.
10975 AArch64CC::CondCode OFCC;
10976 SDValue Value, Overflow;
10977 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10978
10979 if (CC == ISD::SETNE)
10980 OFCC = getInvertedCondCode(OFCC);
10981 SDValue CCVal = getCondCode(DAG, OFCC);
10982
10983 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10984 Overflow);
10985 }
10986
10987 if (LHS.getValueType().isInteger()) {
10988 assert((LHS.getValueType() == RHS.getValueType()) &&
10989 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10990
10991 // If the RHS of the comparison is zero, we can potentially fold this
10992 // to a specialized branch.
10993 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10994 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10995 if (CC == ISD::SETEQ) {
10996 // See if we can use a TBZ to fold in an AND as well.
10997 // TBZ has a smaller branch displacement than CBZ. If the offset is
10998 // out of bounds, a late MI-layer pass rewrites branches.
10999 // 403.gcc is an example that hits this case.
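        // For example (illustrative): (brcond (seteq (and x, 0x8), 0), dest)
        // becomes "tbz x, #3, dest"; anything else falls through to the
        // generic CBZ below.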
11000 if (LHS.getOpcode() == ISD::AND &&
11001 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11002 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11003 SDValue Test = LHS.getOperand(0);
11004 uint64_t Mask = LHS.getConstantOperandVal(1);
11005 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
11006 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11007 Dest);
11008 }
11009
11010 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11011 } else if (CC == ISD::SETNE) {
11012 // See if we can use a TBZ to fold in an AND as well.
11013 // TBZ has a smaller branch displacement than CBZ. If the offset is
11014 // out of bounds, a late MI-layer pass rewrites branches.
11015 // 403.gcc is an example that hits this case.
11016 if (LHS.getOpcode() == ISD::AND &&
11017 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11018 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11019 SDValue Test = LHS.getOperand(0);
11020 uint64_t Mask = LHS.getConstantOperandVal(1);
11021 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
11022 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11023 Dest);
11024 }
11025
11026 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11027 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11028 // Don't combine AND since emitComparison converts the AND to an ANDS
11029 // (a.k.a. TST) and the test in the test bit and branch instruction
11030 // becomes redundant. This would also increase register pressure.
11031 uint64_t SignBitPos;
11032 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11033 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11034 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11035 }
11036 }
11037 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11038 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11039 // Don't combine AND since emitComparison converts the AND to an ANDS
11040 // (a.k.a. TST) and the test in the test bit and branch instruction
11041 // becomes redundant. This would also increase register pressure.
11042 uint64_t SignBitPos;
11043 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11044 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11045 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11046 }
11047
11048 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11049 // larger branch displacement but do prefer CB over cmp + br.
11050 if (Subtarget->hasCMPBR() &&
11052 ProduceNonFlagSettingCondBr) {
11053 SDValue Cond =
11055 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11056 Dest);
11057 }
11058
11059 SDValue CCVal;
11060 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11061 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11062 Cmp);
11063 }
11064
11065 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11066 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11067
11068 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11069 // clean. Some of them require two branches to implement.
11070 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11071 AArch64CC::CondCode CC1, CC2;
11072 changeFPCCToAArch64CC(CC, CC1, CC2);
11073 SDValue CC1Val = getCondCode(DAG, CC1);
11074 SDValue BR1 =
11075 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11076 if (CC2 != AArch64CC::AL) {
11077 SDValue CC2Val = getCondCode(DAG, CC2);
11078 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11079 Cmp);
11080 }
11081
11082 return BR1;
11083}
11084
11085SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11086 SelectionDAG &DAG) const {
11087 if (!Subtarget->isNeonAvailable() &&
11088 !Subtarget->useSVEForFixedLengthVectors())
11089 return SDValue();
11090
11091 EVT VT = Op.getValueType();
11092 EVT IntVT = VT.changeTypeToInteger();
11093 SDLoc DL(Op);
11094
11095 SDValue In1 = Op.getOperand(0);
11096 SDValue In2 = Op.getOperand(1);
11097 EVT SrcVT = In2.getValueType();
11098
11099 if (!SrcVT.bitsEq(VT))
11100 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11101
11102 if (VT.isScalableVector())
11103 IntVT =
11104 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11105
11106 if (VT.isFixedLengthVector() &&
11107 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11108 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11109
11110 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11111 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11112
11113 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11114 return convertFromScalableVector(DAG, VT, Res);
11115 }
11116
11117 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11118 // a SVE FCOPYSIGN.
11119 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11120 Subtarget->isSVEorStreamingSVEAvailable()) {
11121 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11122 return SDValue();
11123 EVT SVT = getPackedSVEVectorVT(VT);
11124
11125 SDValue Ins1 =
11126 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11127 DAG.getConstant(0, DL, MVT::i64));
11128 SDValue Ins2 =
11129 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11130 DAG.getConstant(0, DL, MVT::i64));
11131 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11132 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11133 DAG.getConstant(0, DL, MVT::i64));
11134 }
11135
11136 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11137 if (VT.isScalableVector())
11138 return getSVESafeBitCast(VT, Op, DAG);
11139
11140 return DAG.getBitcast(VT, Op);
11141 };
11142
11143 SDValue VecVal1, VecVal2;
11144 EVT VecVT;
11145 auto SetVecVal = [&](int Idx = -1) {
11146 if (!VT.isVector()) {
11147 VecVal1 =
11148 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11149 VecVal2 =
11150 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11151 } else {
11152 VecVal1 = BitCast(VecVT, In1, DAG);
11153 VecVal2 = BitCast(VecVT, In2, DAG);
11154 }
11155 };
11156 if (VT.isVector()) {
11157 VecVT = IntVT;
11158 SetVecVal();
11159 } else if (VT == MVT::f64) {
11160 VecVT = MVT::v2i64;
11161 SetVecVal(AArch64::dsub);
11162 } else if (VT == MVT::f32) {
11163 VecVT = MVT::v4i32;
11164 SetVecVal(AArch64::ssub);
11165 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11166 VecVT = MVT::v8i16;
11167 SetVecVal(AArch64::hsub);
11168 } else {
11169 llvm_unreachable("Invalid type for copysign!");
11170 }
11171
11172 unsigned BitWidth = In1.getScalarValueSizeInBits();
11173 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11174
11175 // We want to materialize a mask with every bit but the high bit set, but the
11176 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11177 // 64-bit elements. Instead, materialize all bits set and then negate that.
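// (FNEG only flips the sign bit, so negating the all-ones pattern clears just
// the top bit of each 64-bit element, leaving exactly ~SignMask.)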
11178 if (VT == MVT::f64 || VT == MVT::v2f64) {
11179 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11180 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11181 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11182 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11183 }
11184
11185 SDValue BSP =
11186 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11187 if (VT == MVT::f16 || VT == MVT::bf16)
11188 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11189 if (VT == MVT::f32)
11190 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11191 if (VT == MVT::f64)
11192 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11193
11194 return BitCast(VT, BSP, DAG);
11195}
11196
11197SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11198 SelectionDAG &DAG) const {
11200 Attribute::NoImplicitFloat))
11201 return SDValue();
11202
11203 EVT VT = Op.getValueType();
11204 if (VT.isScalableVector() ||
11205 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11206 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11207
11208 bool IsParity = Op.getOpcode() == ISD::PARITY;
11209 SDValue Val = Op.getOperand(0);
11210 SDLoc DL(Op);
11211
11212 // For i32, computing parity with a chain of EORs is more efficient than
11213 // going through the floating-point/SIMD registers.
11214 if (VT == MVT::i32 && IsParity)
11215 return SDValue();
11216
11217 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11218 if (VT == MVT::i32 || VT == MVT::i64) {
11219 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11220 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11221 DAG.getUNDEF(ContainerVT), Val,
11222 DAG.getVectorIdxConstant(0, DL));
11223 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11224 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11225 DAG.getVectorIdxConstant(0, DL));
11226 if (IsParity)
11227 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11228 return Val;
11229 }
11230
11231 if (VT == MVT::i128) {
11232 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11233 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11234 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11235 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11236 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11237 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11238 if (IsParity)
11239 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11240 return Val;
11241 }
11242 }
11243
11244 if (!Subtarget->isNeonAvailable())
11245 return SDValue();
11246
11247 // Without a scalar CNT instruction, GPR popcount can be lowered more
11248 // efficiently to the following sequence, which uses AdvSIMD
11249 // registers/instructions, as long as the copies to/from the AdvSIMD
11250 // registers are cheap.
11251 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11252 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11253 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11254 // FMOV X0, D0 // copy result back to integer reg
11255 if (VT == MVT::i32 || VT == MVT::i64) {
11256 if (VT == MVT::i32)
11257 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11258 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11259
11260 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11261 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11262 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11263 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11264 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11265 DAG.getConstant(0, DL, MVT::i64));
11266 if (IsParity)
11267 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11268 return AddV;
11269 } else if (VT == MVT::i128) {
11270 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11271
11272 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11273 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11274 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11275 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11276 DAG.getConstant(0, DL, MVT::i64));
11277 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11278 if (IsParity)
11279 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11280 return AddV;
11281 }
11282
11283 assert(!IsParity && "ISD::PARITY of vector types not supported");
11284
11285 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11286 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11287 "Unexpected type for custom ctpop lowering");
11288
11289 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11290 Val = DAG.getBitcast(VT8Bit, Val);
11291 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11292
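// A UDOT of the byte counts against a vector of ones accumulates, for each
// 32-bit lane, the sum of its four byte counts; the v2i64 case then only
// needs one more pairwise widening add (UADDLP).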
11293 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11294 VT.getVectorNumElements() >= 2) {
11295 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11296 SDValue Zeros = DAG.getConstant(0, DL, DT);
11297 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11298
11299 if (VT == MVT::v2i64) {
11300 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11301 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11302 } else if (VT == MVT::v2i32) {
11303 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11304 } else if (VT == MVT::v4i32) {
11305 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11306 } else {
11307 llvm_unreachable("Unexpected type for custom ctpop lowering");
11308 }
11309
11310 return Val;
11311 }
11312
11313 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
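// For example, for VT == v4i32 the v16i8 byte counts are widened twice:
// uaddlp to v8i16, then uaddlp again to v4i32.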
11314 unsigned EltSize = 8;
11315 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11316 while (EltSize != VT.getScalarSizeInBits()) {
11317 EltSize *= 2;
11318 NumElts /= 2;
11319 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11320 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11321 }
11322
11323 return Val;
11324}
11325
11326SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11327 EVT VT = Op.getValueType();
11328 assert(VT.isScalableVector() ||
11330 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11331
11332 SDLoc DL(Op);
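// cttz(x) == ctlz(bitreverse(x)); both BITREVERSE and CTLZ have predicated
// SVE lowerings (RBIT and CLZ).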
11333 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11334 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11335}
11336
11337SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11338 SelectionDAG &DAG) const {
11339
11340 EVT VT = Op.getValueType();
11341 SDLoc DL(Op);
11342 unsigned Opcode = Op.getOpcode();
11343 ISD::CondCode CC;
11344 switch (Opcode) {
11345 default:
11346 llvm_unreachable("Wrong instruction");
11347 case ISD::SMAX:
11348 CC = ISD::SETGT;
11349 break;
11350 case ISD::SMIN:
11351 CC = ISD::SETLT;
11352 break;
11353 case ISD::UMAX:
11354 CC = ISD::SETUGT;
11355 break;
11356 case ISD::UMIN:
11357 CC = ISD::SETULT;
11358 break;
11359 }
11360
11361 if (VT.isScalableVector() ||
11363 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11364 switch (Opcode) {
11365 default:
11366 llvm_unreachable("Wrong instruction");
11367 case ISD::SMAX:
11368 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11369 case ISD::SMIN:
11370 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11371 case ISD::UMAX:
11372 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11373 case ISD::UMIN:
11374 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11375 }
11376 }
11377
11378 SDValue Op0 = Op.getOperand(0);
11379 SDValue Op1 = Op.getOperand(1);
11380 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11381 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11382}
11383
11384SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11385 SelectionDAG &DAG) const {
11386 EVT VT = Op.getValueType();
11387
11388 if (VT.isScalableVector() ||
11390 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11392
11393 SDLoc DL(Op);
11394 SDValue REVB;
11395 MVT VST;
11396
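// A bit reverse of each i32/i64 element can be split into a byte reverse
// within the element (REV32/REV64 on the byte vector) followed by a bit
// reverse within each byte (the v8i8/v16i8 BITREVERSE, i.e. RBIT, below).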
11397 switch (VT.getSimpleVT().SimpleTy) {
11398 default:
11399 llvm_unreachable("Invalid type for bitreverse!");
11400
11401 case MVT::v2i32: {
11402 VST = MVT::v8i8;
11403 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11404
11405 break;
11406 }
11407
11408 case MVT::v4i32: {
11409 VST = MVT::v16i8;
11410 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11411
11412 break;
11413 }
11414
11415 case MVT::v1i64: {
11416 VST = MVT::v8i8;
11417 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11418
11419 break;
11420 }
11421
11422 case MVT::v2i64: {
11423 VST = MVT::v16i8;
11424 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11425
11426 break;
11427 }
11428 }
11429
11430 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11431 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11432}
11433
11434 // Check whether this is a continuous comparison sequence (a chain of ORs whose leaves are XORs).
11435static bool
11436isOrXorChain(SDValue N, unsigned &Num,
11437 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11438 if (Num == MaxXors)
11439 return false;
11440
11441 // Skip the one-use zext
11442 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11443 N = N->getOperand(0);
11444
11445 // The leaf node must be XOR
11446 if (N->getOpcode() == ISD::XOR) {
11447 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11448 Num++;
11449 return true;
11450 }
11451
11452 // All the non-leaf nodes must be OR.
11453 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11454 return false;
11455
11456 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11457 isOrXorChain(N->getOperand(1), Num, WorkList))
11458 return true;
11459 return false;
11460}
11461
11462 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp.
11464 SDValue LHS = N->getOperand(0);
11465 SDValue RHS = N->getOperand(1);
11466 SDLoc DL(N);
11467 EVT VT = N->getValueType(0);
11469
11470 // Only handle integer compares.
11471 if (N->getOpcode() != ISD::SETCC)
11472 return SDValue();
11473
11474 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11475 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11476 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
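// This works because (A0 ^ A1) | (B0 ^ B1) == 0 iff A0 == A1 && B0 == B1, so
// the compare of the whole chain against zero becomes a conjunction (for
// SETEQ) or disjunction (for SETNE) of per-pair compares, which can later be
// folded into cmp + ccmp.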
11477 unsigned NumXors = 0;
11478 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11479 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11480 isOrXorChain(LHS, NumXors, WorkList)) {
11481 SDValue XOR0, XOR1;
11482 std::tie(XOR0, XOR1) = WorkList[0];
11483 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11484 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11485 for (unsigned I = 1; I < WorkList.size(); I++) {
11486 std::tie(XOR0, XOR1) = WorkList[I];
11487 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11488 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11489 }
11490
11491 // Exit early by inverting the condition, which helps reduce indentation.
11492 return Cmp;
11493 }
11494
11495 return SDValue();
11496}
11497
11498SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11499
11500 if (Op.getValueType().isVector())
11501 return LowerVSETCC(Op, DAG);
11502
11503 bool IsStrict = Op->isStrictFPOpcode();
11504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11505 unsigned OpNo = IsStrict ? 1 : 0;
11506 SDValue Chain;
11507 if (IsStrict)
11508 Chain = Op.getOperand(0);
11509 SDValue LHS = Op.getOperand(OpNo + 0);
11510 SDValue RHS = Op.getOperand(OpNo + 1);
11511 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11512 SDLoc DL(Op);
11513
11514 // We chose ZeroOrOneBooleanContents, so use zero and one.
11515 EVT VT = Op.getValueType();
11516 SDValue TVal = DAG.getConstant(1, DL, VT);
11517 SDValue FVal = DAG.getConstant(0, DL, VT);
11518
11519 // Handle f128 first, since one possible outcome is a normal integer
11520 // comparison which gets picked up by the next if statement.
11521 if (LHS.getValueType() == MVT::f128) {
11522 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11523 IsSignaling);
11524
11525 // If softenSetCCOperands returned a scalar, use it.
11526 if (!RHS.getNode()) {
11527 assert(LHS.getValueType() == Op.getValueType() &&
11528 "Unexpected setcc expansion!");
11529 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11530 }
11531 }
11532
11533 if (LHS.getValueType().isInteger()) {
11534
11535 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11536
11537 SDValue CCVal;
11539 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11540
11541 // Note that we inverted the condition above, so we reverse the order of
11542 // the true and false operands here. This will allow the setcc to be
11543 // matched to a single CSINC instruction.
11544 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11545 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11546 }
11547
11548 // Now we know we're dealing with FP values.
11549 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11550 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11551
11552 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11553 // and do the comparison.
11554 SDValue Cmp;
11555 if (IsStrict)
11556 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11557 else
11558 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11559
11560 AArch64CC::CondCode CC1, CC2;
11561 changeFPCCToAArch64CC(CC, CC1, CC2);
11562 SDValue Res;
11563 if (CC2 == AArch64CC::AL) {
11564 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11565 CC2);
11566 SDValue CC1Val = getCondCode(DAG, CC1);
11567
11568 // Note that we inverted the condition above, so we reverse the order of
11569 // the true and false operands here. This will allow the setcc to be
11570 // matched to a single CSINC instruction.
11571 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11572 } else {
11573 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11574 // totally clean. Some of them require two CSELs to implement. In this
11575 // case, we emit the first CSEL and then emit a second using the output
11576 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11577
11578 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11579 SDValue CC1Val = getCondCode(DAG, CC1);
11580 SDValue CS1 =
11581 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11582
11583 SDValue CC2Val = getCondCode(DAG, CC2);
11584 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11585 }
11586 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11587}
11588
11589SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11590 SelectionDAG &DAG) const {
11591
11592 SDValue LHS = Op.getOperand(0);
11593 SDValue RHS = Op.getOperand(1);
11594 EVT VT = LHS.getValueType();
11595 if (VT != MVT::i32 && VT != MVT::i64)
11596 return SDValue();
11597
11598 SDLoc DL(Op);
11599 SDValue Carry = Op.getOperand(2);
11600 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
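// (An AArch64 subtract is computed as LHS + ~RHS + C, so a set carry flag
// means "no borrow".)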
11601 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11602 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11603 LHS, RHS, InvCarry);
11604
11605 EVT OpVT = Op.getValueType();
11606 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11607 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11608
11609 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11611 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11612 // Inputs are swapped because the condition is inverted. This will allow
11613 // matching with a single CSINC instruction.
11614 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11615 Cmp.getValue(1));
11616}
11617
11618/// Emit vector comparison for floating-point values, producing a mask.
11620 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11621 const SDLoc &DL, SelectionDAG &DAG) {
11622 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11623 "function only supposed to emit natural comparisons");
11624
11625 switch (CC) {
11626 default:
11627 return SDValue();
11628 case AArch64CC::NE: {
11629 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11630 // Use vector semantics for the inversion to potentially save a copy between
11631 // SIMD and regular registers.
11632 if (!LHS.getValueType().isVector()) {
11633 EVT VecVT =
11634 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11635 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11636 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11637 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11638 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11639 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11640 }
11641 return DAG.getNOT(DL, Fcmeq, VT);
11642 }
11643 case AArch64CC::EQ:
11644 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11645 case AArch64CC::GE:
11646 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11647 case AArch64CC::GT:
11648 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11649 case AArch64CC::LE:
11650 if (!NoNans)
11651 return SDValue();
11652 // If we ignore NaNs then we can use the LS implementation.
11653 [[fallthrough]];
11654 case AArch64CC::LS:
11655 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11656 case AArch64CC::LT:
11657 if (!NoNans)
11658 return SDValue();
11659 // If we ignore NaNs then we can use the MI implementation.
11660 [[fallthrough]];
11661 case AArch64CC::MI:
11662 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11663 }
11664}
11665
11666/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11667/// values are scalars, try to emit a mask generating vector instruction.
11669 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11670 const SDLoc &DL, SelectionDAG &DAG) {
11671 assert(!LHS.getValueType().isVector());
11672 assert(!RHS.getValueType().isVector());
11673
11674 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11675 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11676 if (!CTVal || !CFVal)
11677 return {};
11678 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11679 !(CTVal->isZero() && CFVal->isAllOnes()))
11680 return {};
11681
11682 if (CTVal->isZero())
11683 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11684
11685 EVT VT = TVal.getValueType();
11686 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11687 return {};
11688
11689 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
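// An unordered compare is true iff at least one operand is a NaN. If one
// operand is known never to be a NaN, it is enough to test the other operand
// against itself: x != x (unordered) holds iff x is a NaN.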
11690 bool OneNaN = false;
11691 if (LHS == RHS) {
11692 OneNaN = true;
11693 } else if (DAG.isKnownNeverNaN(RHS)) {
11694 OneNaN = true;
11695 RHS = LHS;
11696 } else if (DAG.isKnownNeverNaN(LHS)) {
11697 OneNaN = true;
11698 LHS = RHS;
11699 }
11700 if (OneNaN)
11701 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11702 }
11703
11706 bool ShouldInvert = false;
11707 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11708 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11709 SDValue Cmp2;
11710 if (CC2 != AArch64CC::AL) {
11711 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11712 if (!Cmp2)
11713 return {};
11714 }
11715 if (!Cmp2 && !ShouldInvert)
11716 return Cmp;
11717
11718 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11719 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11720 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11721 Zero);
11722 if (Cmp2) {
11723 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11724 Cmp2, Zero);
11725 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11726 }
11727 if (ShouldInvert)
11728 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11729 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11730 return Cmp;
11731}
11732
11733SDValue AArch64TargetLowering::LowerSELECT_CC(
11736 const SDLoc &DL, SelectionDAG &DAG) const {
11737 // Handle f128 first, because it will result in a comparison of some RTLIB
11738 // call result against zero.
11739 if (LHS.getValueType() == MVT::f128) {
11740 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11741
11742 // If softenSetCCOperands returned a scalar, we need to compare the result
11743 // against zero to select between true and false values.
11744 if (!RHS.getNode()) {
11745 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11746 CC = ISD::SETNE;
11747 }
11748 }
11749
11750 // Also handle f16, for which we need to do a f32 comparison.
11751 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11752 LHS.getValueType() == MVT::bf16) {
11753 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11754 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11755 }
11756
11757 // Next, handle integers.
11758 if (LHS.getValueType().isInteger()) {
11759 assert((LHS.getValueType() == RHS.getValueType()) &&
11760 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11761
11762 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11763 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11764 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11765
11766 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11767 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11768 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11769 // Both require fewer instructions than a compare and conditional select.
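// (The (SRA lhs, typesize-1) mask is all ones when lhs is negative and zero
// otherwise, so ANDing lhs with it, or with its inverse, yields lhs or 0.)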
11770 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11771 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11772 LHS.getValueType() == RHS.getValueType()) {
11773 EVT VT = LHS.getValueType();
11774 SDValue Shift =
11775 DAG.getNode(ISD::SRA, DL, VT, LHS,
11776 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11777
11778 if (CC == ISD::SETGT)
11779 Shift = DAG.getNOT(DL, Shift, VT);
11780
11781 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11782 }
11783
11784 // Check for sign bit test patterns that can use TST optimization.
11785 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
11786 // -> TST %operand, sign_bit; CSEL
11787 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
11788 // -> TST %operand, sign_bit; CSEL
11789 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
11790 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
11791 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11792
11793 uint64_t SignBitPos;
11794 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11795 EVT TestVT = LHS.getValueType();
11796 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
11797 SDValue TST =
11798 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
11799 LHS, SignBitConst);
11800
11801 SDValue Flags = TST.getValue(1);
11802 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
11803 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
11804 }
11805
11806 // Canonicalise absolute difference patterns:
11807 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11808 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11809 //
11810 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11811 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11812 // The second forms can be matched into subs+cneg.
11813 // NOTE: Drop poison generating flags from the negated operand to avoid
11814 // inadvertently propagating poison after the canonicalisation.
11815 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11816 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11817 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11819 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11820 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11821 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11823 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11824 }
11825 }
11826
11827 unsigned Opcode = AArch64ISD::CSEL;
11828
11829 // If both the TVal and the FVal are constants, see if we can swap them in
11830 // order to form a CSINV or CSINC out of them.
11831 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11832 std::swap(TVal, FVal);
11833 std::swap(CTVal, CFVal);
11834 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11835 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11836 std::swap(TVal, FVal);
11837 std::swap(CTVal, CFVal);
11838 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11839 } else if (TVal.getOpcode() == ISD::XOR) {
11840 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11841 // with a CSINV rather than a CSEL.
11842 if (isAllOnesConstant(TVal.getOperand(1))) {
11843 std::swap(TVal, FVal);
11844 std::swap(CTVal, CFVal);
11845 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11846 }
11847 } else if (TVal.getOpcode() == ISD::SUB) {
11848 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11849 // that we can match with a CSNEG rather than a CSEL.
11850 if (isNullConstant(TVal.getOperand(0))) {
11851 std::swap(TVal, FVal);
11852 std::swap(CTVal, CFVal);
11853 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11854 }
11855 } else if (CTVal && CFVal) {
11856 const int64_t TrueVal = CTVal->getSExtValue();
11857 const int64_t FalseVal = CFVal->getSExtValue();
11858 bool Swap = false;
11859
11860 // If both TVal and FVal are constants, see if FVal is the
11861 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11862 // instead of a CSEL in that case.
11863 if (TrueVal == ~FalseVal) {
11864 Opcode = AArch64ISD::CSINV;
11865 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11866 TrueVal == -FalseVal) {
11867 Opcode = AArch64ISD::CSNEG;
11868 } else if (TVal.getValueType() == MVT::i32) {
11869 // If our operands are only 32-bit wide, make sure we use 32-bit
11870 // arithmetic for the check whether we can use CSINC. This ensures that
11871 // the addition in the check will wrap around properly in case there is
11872 // an overflow (which would not be the case if we do the check with
11873 // 64-bit arithmetic).
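// For example, TrueVal32 == 0 and FalseVal32 == 0xffffffff satisfy
// TrueVal32 == FalseVal32 + 1 only because the addition wraps in 32 bits;
// the 64-bit sum would be 0x100000000.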
11874 const uint32_t TrueVal32 = CTVal->getZExtValue();
11875 const uint32_t FalseVal32 = CFVal->getZExtValue();
11876
11877 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11878 Opcode = AArch64ISD::CSINC;
11879
11880 if (TrueVal32 > FalseVal32) {
11881 Swap = true;
11882 }
11883 }
11884 } else {
11885 // 64-bit check whether we can use CSINC.
11886 const uint64_t TrueVal64 = TrueVal;
11887 const uint64_t FalseVal64 = FalseVal;
11888
11889 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11890 Opcode = AArch64ISD::CSINC;
11891
11892 if (TrueVal > FalseVal) {
11893 Swap = true;
11894 }
11895 }
11896 }
11897
11898 // Swap TVal and FVal if necessary.
11899 if (Swap) {
11900 std::swap(TVal, FVal);
11901 std::swap(CTVal, CFVal);
11902 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11903 }
11904
11905 if (Opcode != AArch64ISD::CSEL) {
11906 // Drop FVal since we can get its value by simply inverting/negating
11907 // TVal.
11908 FVal = TVal;
11909 }
11910 }
11911
11912 // Avoid materializing a constant when possible by reusing a known value in
11913 // a register. However, don't perform this optimization if the known value
11914 // is one, zero or negative one in the case of a CSEL. We can always
11915 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11916 // FVal, respectively.
11917 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11918 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11919 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11921 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11922 // "a != C ? x : a" to avoid materializing C.
11923 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11924 TVal = LHS;
11925 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11926 FVal = LHS;
11927 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11928 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11929 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11930 // avoid materializing C.
11932 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11933 Opcode = AArch64ISD::CSINV;
11934 TVal = LHS;
11935 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11936 }
11937 }
11938
11939 SDValue CCVal;
11940 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11941 EVT VT = TVal.getValueType();
11942 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11943 }
11944
11945 // Now we know we're dealing with FP values.
11946 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11947 LHS.getValueType() == MVT::f64);
11948 assert(LHS.getValueType() == RHS.getValueType());
11949 EVT VT = TVal.getValueType();
11950
11951 // If the purpose of the comparison is to select between all ones
11952 // or all zeros, try to use a vector comparison because the operands are
11953 // already stored in SIMD registers.
11954 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11955 switch (U->getOpcode()) {
11956 default:
11957 return false;
11960 case AArch64ISD::DUP:
11961 return true;
11962 }
11963 })) {
11964 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11965 SDValue VectorCmp =
11966 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11967 if (VectorCmp)
11968 return VectorCmp;
11969 }
11970
11971 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11972
11973 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11974 // clean. Some of them require two CSELs to implement.
11975 AArch64CC::CondCode CC1, CC2;
11976 changeFPCCToAArch64CC(CC, CC1, CC2);
11977
11978 if (Flags.hasNoSignedZeros()) {
11979 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11980 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11981 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11982 if (RHSVal && RHSVal->isZero()) {
11983 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11984 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11985
11986 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11987 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11988 TVal = LHS;
11989 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11990 CFVal && CFVal->isZero() &&
11991 FVal.getValueType() == LHS.getValueType())
11992 FVal = LHS;
11993 }
11994 }
11995
11996 // Emit first, and possibly only, CSEL.
11997 SDValue CC1Val = getCondCode(DAG, CC1);
11998 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11999
12000 // If we need a second CSEL, emit it, using the output of the first as the
12001 // RHS. We're effectively OR'ing the two CC's together.
12002 if (CC2 != AArch64CC::AL) {
12003 SDValue CC2Val = getCondCode(DAG, CC2);
12004 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12005 }
12006
12007 // Otherwise, return the output of the first CSEL.
12008 return CS1;
12009}
12010
12011SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12012 SelectionDAG &DAG) const {
12013 EVT Ty = Op.getValueType();
12014 auto Idx = Op.getConstantOperandAPInt(2);
12015 int64_t IdxVal = Idx.getSExtValue();
12016 assert(Ty.isScalableVector() &&
12017 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12018
12019 // We can use the splice instruction for certain index values where we are
12020 // able to efficiently generate the correct predicate. The index will be
12021 // negated and used directly as the input to the ptrue instruction, i.e.
12022 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12023 // splice predicate. However, we can only do this if we can guarantee that
12024 // there are enough elements in the vector, hence we check the index <= min
12025 // number of elements.
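// For example, with IdxVal == -2 we emit a ptrue with pattern vl2 (first two
// elements active); after the reverse the last two elements are active, so
// the SPLICE produces the last two elements of operand 0 followed by the
// leading elements of operand 1.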
12026 std::optional<unsigned> PredPattern;
12027 if (Ty.isScalableVector() && IdxVal < 0 &&
12028 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
12029 std::nullopt) {
12030 SDLoc DL(Op);
12031
12032 // Create a predicate where all but the last -IdxVal elements are false.
12033 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
12034 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12035 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12036
12037 // Now splice the two inputs together using the predicate.
12038 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12039 Op.getOperand(1));
12040 }
12041
12042 // We can select to an EXT instruction when indexing the first 256 bytes.
12044 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
12045 return Op;
12046
12047 return SDValue();
12048}
12049
12050SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12051 SelectionDAG &DAG) const {
12052 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12053 SDValue LHS = Op.getOperand(0);
12054 SDValue RHS = Op.getOperand(1);
12055 SDValue TVal = Op.getOperand(2);
12056 SDValue FVal = Op.getOperand(3);
12057 SDNodeFlags Flags = Op->getFlags();
12058 SDLoc DL(Op);
12059 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12060}
12061
12062SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12063 SelectionDAG &DAG) const {
12064 SDValue CCVal = Op->getOperand(0);
12065 SDValue TVal = Op->getOperand(1);
12066 SDValue FVal = Op->getOperand(2);
12067 SDLoc DL(Op);
12068
12069 EVT Ty = Op.getValueType();
12070 if (Ty == MVT::aarch64svcount) {
12071 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12072 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12073 SDValue Sel =
12074 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12075 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12076 }
12077
12078 if (Ty.isScalableVector()) {
12079 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12080 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12081 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12082 }
12083
12084 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12085 // FIXME: Ideally this would be the same as above using i1 types, however
12086 // for the moment we can't deal with fixed i1 vector types properly, so
12087 // instead extend the predicate to a result type sized integer vector.
12088 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12089 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12090 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12091 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12092 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12093 }
12094
12095 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12096 // instruction.
12097 if (ISD::isOverflowIntrOpRes(CCVal)) {
12098 // Only lower legal XALUO ops.
12099 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12100 return SDValue();
12101
12103 SDValue Value, Overflow;
12104 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12105 SDValue CCVal = getCondCode(DAG, OFCC);
12106
12107 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12108 CCVal, Overflow);
12109 }
12110
12111 // Lower it the same way as we would lower a SELECT_CC node.
12112 ISD::CondCode CC;
12113 SDValue LHS, RHS;
12114 if (CCVal.getOpcode() == ISD::SETCC) {
12115 LHS = CCVal.getOperand(0);
12116 RHS = CCVal.getOperand(1);
12117 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12118 } else {
12119 LHS = CCVal;
12120 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12121 CC = ISD::SETNE;
12122 }
12123
12124 // If we are lowering an f16 or bf16 and do not have full fp16 support,
12125 // convert to an f32 in order to use FCSELSrrr.
12126 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12127 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12128 DAG.getUNDEF(MVT::f32), TVal);
12129 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12130 DAG.getUNDEF(MVT::f32), FVal);
12131 }
12132
12133 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12134 Op->getFlags(), DL, DAG);
12135
12136 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12137 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12138 }
12139
12140 return Res;
12141}
12142
12143SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12144 SelectionDAG &DAG) const {
12145 // Jump table entries are emitted as PC-relative offsets. No additional
12146 // tweaking is necessary here; just get the address of the jump table.
12147 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12148
12151 !Subtarget->isTargetMachO())
12152 return getAddrLarge(JT, DAG);
12153 if (CM == CodeModel::Tiny)
12154 return getAddrTiny(JT, DAG);
12155 return getAddr(JT, DAG);
12156}
12157
12158SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12159 SelectionDAG &DAG) const {
12160 // Jump table entries are emitted as PC-relative offsets. No additional
12161 // tweaking is necessary here; just get the address of the jump table.
12162 SDLoc DL(Op);
12163 SDValue JT = Op.getOperand(1);
12164 SDValue Entry = Op.getOperand(2);
12165 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12166
12167 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12168 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12169
12170 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12171 // sequence later, to guarantee the integrity of the intermediate values.
12173 "aarch64-jump-table-hardening")) {
12175 if (Subtarget->isTargetMachO()) {
12176 if (CM != CodeModel::Small && CM != CodeModel::Large)
12177 report_fatal_error("Unsupported code-model for hardened jump-table");
12178 } else {
12179 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12180 assert(Subtarget->isTargetELF() &&
12181 "jump table hardening only supported on MachO/ELF");
12182 if (CM != CodeModel::Small)
12183 report_fatal_error("Unsupported code-model for hardened jump-table");
12184 }
12185
12186 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12187 Entry, SDValue());
12188 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12189 DAG.getTargetJumpTable(JTI, MVT::i32),
12190 X16Copy.getValue(0), X16Copy.getValue(1));
12191 return SDValue(B, 0);
12192 }
12193
12194 SDNode *Dest =
12195 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12196 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12197 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12198 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12199}
12200
12201SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12202 SDValue Chain = Op.getOperand(0);
12203 SDValue Dest = Op.getOperand(1);
12204
12205 // BR_JT is lowered to BRIND, but the latter lowering here is specific to indirectbr.
12206 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12207 if (Dest->isMachineOpcode() &&
12208 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12209 return SDValue();
12210
12211 const MachineFunction &MF = DAG.getMachineFunction();
12212 std::optional<uint16_t> BADisc =
12213 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12214 if (!BADisc)
12215 return SDValue();
12216
12217 SDLoc DL(Op);
12218
12219 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12221 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12222
12223 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12224 {Dest, Key, Disc, AddrDisc, Chain});
12225 return SDValue(BrA, 0);
12226}
12227
12228SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12229 SelectionDAG &DAG) const {
12230 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12232 if (CM == CodeModel::Large) {
12233 // Use the GOT for the large code model on iOS.
12234 if (Subtarget->isTargetMachO()) {
12235 return getGOT(CP, DAG);
12236 }
12238 return getAddrLarge(CP, DAG);
12239 } else if (CM == CodeModel::Tiny) {
12240 return getAddrTiny(CP, DAG);
12241 }
12242 return getAddr(CP, DAG);
12243}
12244
12245SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12246 SelectionDAG &DAG) const {
12247 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12248 const BlockAddress *BA = BAN->getBlockAddress();
12249
12250 if (std::optional<uint16_t> BADisc =
12251 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12252 *BA->getFunction())) {
12253 SDLoc DL(Op);
12254
12255 // This isn't cheap, but BRIND is rare.
12256 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12257
12258 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12259
12261 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12262
12263 SDNode *MOV =
12264 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12265 {TargetBA, Key, AddrDisc, Disc});
12266 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12267 SDValue(MOV, 1));
12268 }
12269
12271 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12273 return getAddrLarge(BAN, DAG);
12274 } else if (CM == CodeModel::Tiny) {
12275 return getAddrTiny(BAN, DAG);
12276 }
12277 return getAddr(BAN, DAG);
12278}
12279
12280SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12281 SelectionDAG &DAG) const {
12282 AArch64FunctionInfo *FuncInfo =
12283 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12284
12285 SDLoc DL(Op);
12286 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12288 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12289 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12290 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12291 MachinePointerInfo(SV));
12292}
12293
12294SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12295 SelectionDAG &DAG) const {
12296 MachineFunction &MF = DAG.getMachineFunction();
12297 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12298
12299 SDLoc DL(Op);
12300 SDValue FR;
12301 if (Subtarget->isWindowsArm64EC()) {
12302 // With the Arm64EC ABI, we compute the address of the varargs save area
12303 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12304 // but calls from an entry thunk can pass in a different address.
12305 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12306 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12307 uint64_t StackOffset;
12308 if (FuncInfo->getVarArgsGPRSize() > 0)
12309 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12310 else
12311 StackOffset = FuncInfo->getVarArgsStackOffset();
12312 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12313 DAG.getConstant(StackOffset, DL, MVT::i64));
12314 } else {
12315 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12316 ? FuncInfo->getVarArgsGPRIndex()
12317 : FuncInfo->getVarArgsStackIndex(),
12319 }
12320 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12321 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12322 MachinePointerInfo(SV));
12323}
12324
12325SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12326 SelectionDAG &DAG) const {
12327 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12328 // Standard, section B.3.
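// In C terms the structure is roughly:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };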
12329 MachineFunction &MF = DAG.getMachineFunction();
12330 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12331 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12332 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12333 auto PtrVT = getPointerTy(DAG.getDataLayout());
12334 SDLoc DL(Op);
12335
12336 SDValue Chain = Op.getOperand(0);
12337 SDValue VAList = Op.getOperand(1);
12338 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12340
12341 // void *__stack at offset 0
12342 unsigned Offset = 0;
12343 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12344 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12345 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12346 MachinePointerInfo(SV), Align(PtrSize)));
12347
12348 // void *__gr_top at offset 8 (4 on ILP32)
12349 Offset += PtrSize;
12350 int GPRSize = FuncInfo->getVarArgsGPRSize();
12351 if (GPRSize > 0) {
12352 SDValue GRTop, GRTopAddr;
12353
12354 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12355 DAG.getConstant(Offset, DL, PtrVT));
12356
12357 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12358 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12359 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12360 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12361
12362 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12363 MachinePointerInfo(SV, Offset),
12364 Align(PtrSize)));
12365 }
12366
12367 // void *__vr_top at offset 16 (8 on ILP32)
12368 Offset += PtrSize;
12369 int FPRSize = FuncInfo->getVarArgsFPRSize();
12370 if (FPRSize > 0) {
12371 SDValue VRTop, VRTopAddr;
12372 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12373 DAG.getConstant(Offset, DL, PtrVT));
12374
12375 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12376 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12377 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12378 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12379
12380 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12381 MachinePointerInfo(SV, Offset),
12382 Align(PtrSize)));
12383 }
12384
12385 // int __gr_offs at offset 24 (12 on ILP32)
12386 Offset += PtrSize;
12387 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12388 DAG.getConstant(Offset, DL, PtrVT));
12389 MemOps.push_back(
12390 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12391 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12392
12393 // int __vr_offs at offset 28 (16 on ILP32)
12394 Offset += 4;
12395 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12396 DAG.getConstant(Offset, DL, PtrVT));
12397 MemOps.push_back(
12398 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12399 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12400
12401 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12402}
12403
12404SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12405 SelectionDAG &DAG) const {
12406 MachineFunction &MF = DAG.getMachineFunction();
12407 Function &F = MF.getFunction();
12408
12409 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12410 return LowerWin64_VASTART(Op, DAG);
12411 else if (Subtarget->isTargetDarwin())
12412 return LowerDarwin_VASTART(Op, DAG);
12413 else
12414 return LowerAAPCS_VASTART(Op, DAG);
12415}
12416
12417SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12418 SelectionDAG &DAG) const {
12419 // AAPCS has three pointers and two ints (= 32 bytes, 20 on ILP32); Darwin
12420 // and Windows have a single pointer.
12421 SDLoc DL(Op);
12422 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12423 unsigned VaListSize =
12424 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12425 ? PtrSize
12426 : Subtarget->isTargetILP32() ? 20 : 32;
12427 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12428 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12429
12430 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12431 DAG.getConstant(VaListSize, DL, MVT::i32),
12432 Align(PtrSize), false, false, /*CI=*/nullptr,
12433 std::nullopt, MachinePointerInfo(DestSV),
12434 MachinePointerInfo(SrcSV));
12435}
12436
12437SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12438 assert(Subtarget->isTargetDarwin() &&
12439 "automatic va_arg instruction only works on Darwin");
12440
12441 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12442 EVT VT = Op.getValueType();
12443 SDLoc DL(Op);
12444 SDValue Chain = Op.getOperand(0);
12445 SDValue Addr = Op.getOperand(1);
12446 MaybeAlign Align(Op.getConstantOperandVal(3));
12447 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12448 auto PtrVT = getPointerTy(DAG.getDataLayout());
12449 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12450 SDValue VAList =
12451 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12452 Chain = VAList.getValue(1);
12453 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12454
12455 if (VT.isScalableVector())
12456 report_fatal_error("Passing SVE types to variadic functions is "
12457 "currently not supported");
12458
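// If an over-aligned argument was requested, round VAList up to that
// alignment first: VAList = (VAList + Align - 1) & -Align.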
12459 if (Align && *Align > MinSlotSize) {
12460 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12461 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12462 VAList =
12463 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12464 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12465 }
12466
12467 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12468 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12469
12470 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12471 // up to 64 bits. At the very least, we have to increase the striding of the
12472 // vaargs list to match this, and for FP values we need to introduce
12473 // FP_ROUND nodes as well.
12474 if (VT.isInteger() && !VT.isVector())
12475 ArgSize = std::max(ArgSize, MinSlotSize);
12476 bool NeedFPTrunc = false;
12477 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12478 ArgSize = 8;
12479 NeedFPTrunc = true;
12480 }
12481
12482 // Increment the pointer, VAList, to the next vaarg
12483 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12484 DAG.getConstant(ArgSize, DL, PtrVT));
12485 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12486
12487 // Store the incremented VAList to the legalized pointer
12488 SDValue APStore =
12489 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12490
12491 // Load the actual argument out of the pointer VAList
12492 if (NeedFPTrunc) {
12493 // Load the value as an f64.
12494 SDValue WideFP =
12495 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12496 // Round the value down to an f32.
12497 SDValue NarrowFP =
12498 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12499 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12500 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12501 // Merge the rounded value with the chain output of the load.
12502 return DAG.getMergeValues(Ops, DL);
12503 }
12504
12505 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12506}
12507
12508SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12509 SelectionDAG &DAG) const {
12510 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12511 MFI.setFrameAddressIsTaken(true);
12512
12513 EVT VT = Op.getValueType();
12514 SDLoc DL(Op);
12515 unsigned Depth = Op.getConstantOperandVal(0);
12516 SDValue FrameAddr =
12517 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12518 while (Depth--)
12519 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12520 MachinePointerInfo());
12521
12522 if (Subtarget->isTargetILP32())
12523 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12524 DAG.getValueType(VT));
12525
12526 return FrameAddr;
12527}
12528
12529SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12530 SelectionDAG &DAG) const {
12531 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12532
12533 EVT VT = getPointerTy(DAG.getDataLayout());
12534 int FI = MFI.CreateFixedObject(4, 0, false);
12535 return DAG.getFrameIndex(FI, VT);
12536}
12537
12538#define GET_REGISTER_MATCHER
12539#include "AArch64GenAsmMatcher.inc"
12540
12541// FIXME? Maybe this could be a TableGen attribute on some registers and
12542// this table could be generated automatically from RegInfo.
12543Register AArch64TargetLowering::
12544getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12546 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12547 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12548 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12549 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12550 !MRI->isReservedReg(MF, Reg))
12551 Reg = Register();
12552 }
12553 return Reg;
12554}
12555
12556SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12557 SelectionDAG &DAG) const {
12559
12560 EVT VT = Op.getValueType();
12561 SDLoc DL(Op);
12562
12563 SDValue FrameAddr =
12564 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12566
12567 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12568}
12569
12570SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12571 SelectionDAG &DAG) const {
12572 MachineFunction &MF = DAG.getMachineFunction();
12573 MachineFrameInfo &MFI = MF.getFrameInfo();
12574 MFI.setReturnAddressIsTaken(true);
12575
12576 EVT VT = Op.getValueType();
12577 SDLoc DL(Op);
12578 unsigned Depth = Op.getConstantOperandVal(0);
12579 SDValue ReturnAddress;
12580 if (Depth) {
12581 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12583 ReturnAddress = DAG.getLoad(
12584 VT, DL, DAG.getEntryNode(),
12585 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12586 } else {
12587 // Return LR, which contains the return address. Mark it an implicit
12588 // live-in.
12589 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12590 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12591 }
12592
12593 // The XPACLRI instruction assembles to a hint-space instruction before
12594 // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
12595 // architecture. On Armv8.3-A and onwards XPACI is available, so use
12596 // that instead.
12597 SDNode *St;
12598 if (Subtarget->hasPAuth()) {
12599 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12600 } else {
12601 // XPACLRI operates on LR therefore we must move the operand accordingly.
12602 SDValue Chain =
12603 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12604 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12605 }
12606 return SDValue(St, 0);
12607}
12608
12609 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12610 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
12611SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12612 SelectionDAG &DAG) const {
12613 SDValue Lo, Hi;
12614 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12615 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12616}
12617
12619 const GlobalAddressSDNode *GA) const {
12620 // Offsets are folded in the DAG combine rather than here so that we can
12621 // intelligently choose an offset based on the uses.
12622 return false;
12623}
12624
12625bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12626 bool OptForSize) const {
12627 bool IsLegal = false;
12628 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12629 // and for the 16-bit case when the target has full fp16 support.
12630 // We encode bf16 bit patterns as if they were fp16. This results in very
12631 // strange looking assembly but should populate the register with appropriate
12632 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12633 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12634 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12635 // FIXME: We should be able to handle f128 as well with a clever lowering.
12636 const APInt ImmInt = Imm.bitcastToAPInt();
12637 if (VT == MVT::f64)
12638 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12639 else if (VT == MVT::f32)
12640 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12641 else if (VT == MVT::f16 || VT == MVT::bf16)
12642 IsLegal =
12643 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12644 Imm.isPosZero();
12645
12646 // If we cannot materialize the value in the fmov immediate field, check if
12647 // it can be encoded as the immediate operand of a logical instruction.
12648 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12649 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12650 // generate that fmov.
12651 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12652 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12653 // however the mov+fmov sequence is always better because of the reduced
12654 // cache pressure. The timings are still the same if you consider
12655 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12656 // movw+movk is fused). So we limit it to at most 2 instructions.
12659 assert(Insn.size() <= 4 &&
12660 "Should be able to build any value with at most 4 moves");
12661 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12662 IsLegal = Insn.size() <= Limit;
12663 }
12664
12665 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12666 << " imm value: "; Imm.dump(););
12667 return IsLegal;
12668}
12669
12670//===----------------------------------------------------------------------===//
12671// AArch64 Optimization Hooks
12672//===----------------------------------------------------------------------===//
12673
12674static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12675 SDValue Operand, SelectionDAG &DAG,
12676 int &ExtraSteps) {
12677 EVT VT = Operand.getValueType();
12678 if ((ST->hasNEON() &&
12679 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12680 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12681 VT == MVT::v4f32)) ||
12682 (ST->hasSVE() &&
12683 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12685 // For the reciprocal estimates, convergence is quadratic, so the number
12686 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12687 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12688 // the result for float (23 mantissa bits) is 2 and for double (52
12689 // mantissa bits) is 3.
12690 constexpr unsigned AccurateBits = 8;
12691 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12692 ExtraSteps = DesiredBits <= AccurateBits
12693 ? 0
12694 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
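 // Worked example: semanticsPrecision() counts the implicit bit, so for f32
 // DesiredBits = 24 and ExtraSteps = ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2,
 // while for f64 DesiredBits = 53 gives 6 - 3 = 3, matching the comment above.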
12695 }
12696
12697 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12698 }
12699
12700 return SDValue();
12701}
12702
12703SDValue
12704AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12705 const DenormalMode &Mode) const {
12706 SDLoc DL(Op);
12707 EVT VT = Op.getValueType();
12708 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12709 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12710 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12711}
12712
12713SDValue
12714AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12715 SelectionDAG &DAG) const {
12716 return Op;
12717}
12718
12719SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12720 SelectionDAG &DAG, int Enabled,
12721 int &ExtraSteps,
12722 bool &UseOneConst,
12723 bool Reciprocal) const {
12725 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12726 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12727 DAG, ExtraSteps)) {
12728 SDLoc DL(Operand);
12729 EVT VT = Operand.getValueType();
12730
12731 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12732 SDNodeFlags Flags =
12734
12735 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12736 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
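 // One refinement step, as emitted below (illustrative): Step = E * E (FMUL),
 // FRSQRTS(X, Step) yields 0.5 * (3 - X * E^2), and the trailing FMUL forms
 // E * 0.5 * (3 - X * E^2), i.e. a single Newton-Raphson iteration.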
12737 for (int i = ExtraSteps; i > 0; --i) {
12738 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12739 Flags);
12740 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12741 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12742 }
12743 if (!Reciprocal)
12744 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12745
12746 ExtraSteps = 0;
12747 return Estimate;
12748 }
12749
12750 return SDValue();
12751}
12752
12753SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12754 SelectionDAG &DAG, int Enabled,
12755 int &ExtraSteps) const {
12757 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12758 DAG, ExtraSteps)) {
12759 SDLoc DL(Operand);
12760 EVT VT = Operand.getValueType();
12761
12763
12764 // Newton reciprocal iteration: E * (2 - X * E)
12765 // AArch64 reciprocal iteration instruction: (2 - M * N)
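 // One refinement step, as emitted below (illustrative): FRECPS(X, E) yields
 // (2 - X * E) and the FMUL forms E * (2 - X * E), i.e. a single
 // Newton-Raphson iteration for 1/X.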
12766 for (int i = ExtraSteps; i > 0; --i) {
12767 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12768 Estimate, Flags);
12769 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12770 }
12771
12772 ExtraSteps = 0;
12773 return Estimate;
12774 }
12775
12776 return SDValue();
12777}
12778
12779//===----------------------------------------------------------------------===//
12780// AArch64 Inline Assembly Support
12781//===----------------------------------------------------------------------===//
12782
12783// Table of Constraints
12784// TODO: This is the current set of constraints supported by ARM for the
12785// compiler; not all of them may make sense.
12786//
12787// r - A general register
12788// w - An FP/SIMD register of some size in the range v0-v31
12789// x - An FP/SIMD register of some size in the range v0-v15
12790// I - Constant that can be used with an ADD instruction
12791// J - Constant that can be used with a SUB instruction
12792// K - Constant that can be used with a 32-bit logical instruction
12793// L - Constant that can be used with a 64-bit logical instruction
12794// M - Constant that can be used as a 32-bit MOV immediate
12795// N - Constant that can be used as a 64-bit MOV immediate
12796// Q - A memory reference with base register and no offset
12797// S - A symbolic address
12798// Y - Floating point constant zero
12799// Z - Integer constant zero
12800//
12801// Note that general register operands will be output using their 64-bit x
12802// register name, whatever the size of the variable, unless the asm operand
12803// is prefixed by the %w modifier. Floating-point and SIMD register operands
12804// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12805// %q modifier.
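//
// A hypothetical C-level example of a few of these constraints (shown only
// to illustrate the table above, not taken from this file):
//   int r; float f;
//   asm("add %w0, %w1, %2" : "=r"(r) : "r"(r), "I"(4095));
//   asm("fadd %s0, %s1, %s2" : "=w"(f) : "w"(f), "w"(f));
// "r" selects a general register (printed as w0/w1 via the %w modifier), "w"
// an FP/SIMD register, and "I" an ADD immediate in the range [0, 4095].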
12806const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12807 // At this point, we have to lower this constraint to something else, so we
12808 // lower it to an "r" or "w". However, by doing this we will force the result
12809 // to be in register, while the X constraint is much more permissive.
12810 //
12811 // Although we are correct (we are free to emit anything, without
12812 // constraints), we might break use cases that would expect us to be more
12813 // efficient and emit something else.
12814 if (!Subtarget->hasFPARMv8())
12815 return "r";
12816
12817 if (ConstraintVT.isFloatingPoint())
12818 return "w";
12819
12820 if (ConstraintVT.isVector() &&
12821 (ConstraintVT.getSizeInBits() == 64 ||
12822 ConstraintVT.getSizeInBits() == 128))
12823 return "w";
12824
12825 return "r";
12826}
12827
12829
12830// Returns a {Reg, RegisterClass} tuple if the constraint is
12831// a specific predicate register.
12832//
12833// For some constraint like "{pn3}" the default path in
12834// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12835// suitable register class for this register is "PPRorPNR", after which it
12836// determines that nxv16i1 is an appropriate type for the constraint, which is
12837// not what we want. The code here pre-empts this by matching the register
12838// explicitly.
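// For example (illustrative): "{p3}" maps to P3 in the PPR class, "{pn3}" to
// PN3 in the PNR class, and "{z3}" to Z3 in the ZPR class; anything else
// returns std::nullopt so the default handling applies.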
12839static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12841 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12842 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12843 return std::nullopt;
12844
12845 bool IsPredicate = Constraint[1] == 'p';
12846 Constraint = Constraint.substr(2, Constraint.size() - 3);
12847 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12848 if (IsPredicateAsCount)
12849 Constraint = Constraint.drop_front(1);
12850
12851 unsigned V;
12852 if (Constraint.getAsInteger(10, V) || V > 31)
12853 return std::nullopt;
12854
12855 if (IsPredicateAsCount)
12856 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12857 if (IsPredicate)
12858 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12859 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12860}
12861
12862static std::optional<PredicateConstraint>
12865 .Case("Uph", PredicateConstraint::Uph)
12868 .Default(std::nullopt);
12869}
12870
12871static const TargetRegisterClass *
12873 if (VT != MVT::aarch64svcount &&
12874 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12875 return nullptr;
12876
12877 switch (Constraint) {
12879 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12880 : &AArch64::PPR_p8to15RegClass;
12882 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12883 : &AArch64::PPR_3bRegClass;
12885 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12886 : &AArch64::PPRRegClass;
12887 }
12888
12889 llvm_unreachable("Missing PredicateConstraint!");
12890}
12891
12893
12894static std::optional<ReducedGprConstraint>
12897 .Case("Uci", ReducedGprConstraint::Uci)
12899 .Default(std::nullopt);
12900}
12901
12902static const TargetRegisterClass *
12904 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12905 return nullptr;
12906
12907 switch (Constraint) {
12909 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12911 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12912 }
12913
12914 llvm_unreachable("Missing ReducedGprConstraint!");
12915}
12916
12917// The set of cc code supported is from
12918// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
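// For example, C code can consume a flag directly via a flag output operand
// (hypothetical snippet, not from this file):
//   asm("cmp %w1, #0" : "=@cceq"(is_zero) : "r"(x));
// which reaches this lowering as the "{@cceq}" constraint handled below.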
12921 .Case("{@cchi}", AArch64CC::HI)
12922 .Case("{@cccs}", AArch64CC::HS)
12923 .Case("{@cclo}", AArch64CC::LO)
12924 .Case("{@ccls}", AArch64CC::LS)
12925 .Case("{@cccc}", AArch64CC::LO)
12926 .Case("{@cceq}", AArch64CC::EQ)
12927 .Case("{@ccgt}", AArch64CC::GT)
12928 .Case("{@ccge}", AArch64CC::GE)
12929 .Case("{@cclt}", AArch64CC::LT)
12930 .Case("{@ccle}", AArch64CC::LE)
12931 .Case("{@cchs}", AArch64CC::HS)
12932 .Case("{@ccne}", AArch64CC::NE)
12933 .Case("{@ccvc}", AArch64CC::VC)
12934 .Case("{@ccpl}", AArch64CC::PL)
12935 .Case("{@ccvs}", AArch64CC::VS)
12936 .Case("{@ccmi}", AArch64CC::MI)
12938 return Cond;
12939}
12940
12941/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12942/// WZR, invert(<cond>)'.
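/// For example, for CC == EQ this materializes "cset w0, eq", i.e.
/// "csinc w0, wzr, wzr, ne" (illustrative register choice).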
12944 SelectionDAG &DAG) {
12945 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12946 DAG.getConstant(0, DL, MVT::i32),
12947 DAG.getConstant(0, DL, MVT::i32),
12948 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12949}
12950
12951// Lower @cc flag output via getSETCC.
12952SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12953 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12954 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12955 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12956 if (Cond == AArch64CC::Invalid)
12957 return SDValue();
12958 // The output variable should be a scalar integer.
12959 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12960 OpInfo.ConstraintVT.getSizeInBits() < 8)
12961 report_fatal_error("Flag output operand is of invalid type");
12962
12963 // Get NZCV register. Only update chain when copyfrom is glued.
12964 if (Glue.getNode()) {
12965 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12966 Chain = Glue.getValue(1);
12967 } else
12968 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12969 // Extract CC code.
12970 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12971
12973
12974 // Truncate or ZERO_EXTEND based on value types.
12975 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12976 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12977 else
12978 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12979
12980 return Result;
12981}
12982
12983/// getConstraintType - Given a constraint letter, return the type of
12984/// constraint it is for this target.
12986AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12987 if (Constraint.size() == 1) {
12988 switch (Constraint[0]) {
12989 default:
12990 break;
12991 case 'x':
12992 case 'w':
12993 case 'y':
12994 return C_RegisterClass;
12995 // An address with a single base register. Due to the way we
12996 // currently handle addresses it is the same as 'r'.
12997 case 'Q':
12998 return C_Memory;
12999 case 'I':
13000 case 'J':
13001 case 'K':
13002 case 'L':
13003 case 'M':
13004 case 'N':
13005 case 'Y':
13006 case 'Z':
13007 return C_Immediate;
13008 case 'z':
13009 case 'S': // A symbol or label reference with a constant offset
13010 return C_Other;
13011 }
13012 } else if (parsePredicateConstraint(Constraint))
13013 return C_RegisterClass;
13014 else if (parseReducedGprConstraint(Constraint))
13015 return C_RegisterClass;
13016 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13017 return C_Other;
13018 return TargetLowering::getConstraintType(Constraint);
13019}
13020
13021/// Examine constraint type and operand type and determine a weight value.
13022/// This object must already have been set up with the operand type
13023/// and the current alternative constraint selected.
13025AArch64TargetLowering::getSingleConstraintMatchWeight(
13026 AsmOperandInfo &info, const char *constraint) const {
13028 Value *CallOperandVal = info.CallOperandVal;
13029 // If we don't have a value, we can't do a match,
13030 // but allow it at the lowest weight.
13031 if (!CallOperandVal)
13032 return CW_Default;
13033 Type *type = CallOperandVal->getType();
13034 // Look at the constraint type.
13035 switch (*constraint) {
13036 default:
13038 break;
13039 case 'x':
13040 case 'w':
13041 case 'y':
13042 if (type->isFloatingPointTy() || type->isVectorTy())
13043 weight = CW_Register;
13044 break;
13045 case 'z':
13046 weight = CW_Constant;
13047 break;
13048 case 'U':
13049 if (parsePredicateConstraint(constraint) ||
13050 parseReducedGprConstraint(constraint))
13051 weight = CW_Register;
13052 break;
13053 }
13054 return weight;
13055}
13056
13057std::pair<unsigned, const TargetRegisterClass *>
13058AArch64TargetLowering::getRegForInlineAsmConstraint(
13059 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13060 if (Constraint.size() == 1) {
13061 switch (Constraint[0]) {
13062 case 'r':
13063 if (VT.isScalableVector())
13064 return std::make_pair(0U, nullptr);
13065 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13066 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13067 if (VT.getFixedSizeInBits() == 64)
13068 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13069 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13070 case 'w': {
13071 if (!Subtarget->hasFPARMv8())
13072 break;
13073 if (VT.isScalableVector()) {
13074 if (VT.getVectorElementType() != MVT::i1)
13075 return std::make_pair(0U, &AArch64::ZPRRegClass);
13076 return std::make_pair(0U, nullptr);
13077 }
13078 if (VT == MVT::Other)
13079 break;
13080 uint64_t VTSize = VT.getFixedSizeInBits();
13081 if (VTSize == 16)
13082 return std::make_pair(0U, &AArch64::FPR16RegClass);
13083 if (VTSize == 32)
13084 return std::make_pair(0U, &AArch64::FPR32RegClass);
13085 if (VTSize == 64)
13086 return std::make_pair(0U, &AArch64::FPR64RegClass);
13087 if (VTSize == 128)
13088 return std::make_pair(0U, &AArch64::FPR128RegClass);
13089 break;
13090 }
13091 // The instructions that this constraint is designed for can
13092 // only take 128-bit registers so just use that regclass.
13093 case 'x':
13094 if (!Subtarget->hasFPARMv8())
13095 break;
13096 if (VT.isScalableVector())
13097 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13098 if (VT.getSizeInBits() == 128)
13099 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13100 break;
13101 case 'y':
13102 if (!Subtarget->hasFPARMv8())
13103 break;
13104 if (VT.isScalableVector())
13105 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13106 break;
13107 }
13108 } else {
13109 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13110 // SME functions that are not in streaming mode, should
13111 // still observe clobbers of Z-registers by clobbering
13112 // the lower 128bits of those registers.
13113 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13114 !Subtarget->isSVEorStreamingSVEAvailable())
13115 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13116 &AArch64::FPR128RegClass);
13117 return *P;
13118 }
13119 if (const auto PC = parsePredicateConstraint(Constraint))
13120 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13121 return std::make_pair(0U, RegClass);
13122
13123 if (const auto RGC = parseReducedGprConstraint(Constraint))
13124 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13125 return std::make_pair(0U, RegClass);
13126 }
13127 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13129 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13130
13131 if (Constraint == "{za}") {
13132 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13133 }
13134
13135 if (Constraint == "{zt0}") {
13136 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13137 }
13138
13139 // Use the default implementation in TargetLowering to convert the register
13140 // constraint into a member of a register class.
13141 std::pair<unsigned, const TargetRegisterClass *> Res;
13143
13144 // Not found as a standard register?
13145 if (!Res.second) {
13146 unsigned Size = Constraint.size();
13147 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13148 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13149 int RegNo;
13150 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13151 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13152 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13153 // By default we'll emit v0-v31 for this unless there's a modifier, in
13154 // which case we'll emit the correctly sized register instead.
13155 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13156 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13157 Res.second = &AArch64::FPR64RegClass;
13158 } else {
13159 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13160 Res.second = &AArch64::FPR128RegClass;
13161 }
13162 }
13163 }
13164 }
13165
13166 if (Res.second && !Subtarget->hasFPARMv8() &&
13167 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13168 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13169 return std::make_pair(0U, nullptr);
13170
13171 return Res;
13172}
13173
13175 llvm::Type *Ty,
13176 bool AllowUnknown) const {
13177 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13178 return EVT(MVT::i64x8);
13179
13180 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13181}
13182
13183/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13184/// vector. If it is invalid, don't add anything to Ops.
13185void AArch64TargetLowering::LowerAsmOperandForConstraint(
13186 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13187 SelectionDAG &DAG) const {
13188 SDValue Result;
13189
13190 // Currently only support length 1 constraints.
13191 if (Constraint.size() != 1)
13192 return;
13193
13194 char ConstraintLetter = Constraint[0];
13195 switch (ConstraintLetter) {
13196 default:
13197 break;
13198
13199 // This set of constraints deal with valid constants for various instructions.
13200 // Validate and return a target constant for them if we can.
13201 case 'z': {
13202 // 'z' maps to xzr or wzr so it needs an input of 0.
13203 if (!isNullConstant(Op))
13204 return;
13205
13206 if (Op.getValueType() == MVT::i64)
13207 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13208 else
13209 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13210 break;
13211 }
13212 case 'S':
13213 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13214 // supported for PIC while "s" isn't, making "s" less useful. We implement
13215 // "S" but not "s".
13217 break;
13218
13219 case 'I':
13220 case 'J':
13221 case 'K':
13222 case 'L':
13223 case 'M':
13224 case 'N':
13226 if (!C)
13227 return;
13228
13229 // Grab the value and do some validation.
13230 uint64_t CVal = C->getZExtValue();
13231 switch (ConstraintLetter) {
13232 // The I constraint applies only to simple ADD or SUB immediate operands:
13233 // i.e. 0 to 4095 with optional shift by 12
13234 // The J constraint applies only to ADD or SUB immediates that would be
13235 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13236 // instruction [or vice versa], in other words -1 to -4095 with optional
13237 // left shift by 12.
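 // For example (illustrative values, not exhaustive): 4095 and 0xFFF000
 // (4095 << 12) satisfy 'I', while -4095 satisfies 'J'.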
13238 case 'I':
13239 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13240 break;
13241 return;
13242 case 'J': {
13243 uint64_t NVal = -C->getSExtValue();
13244 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13245 CVal = C->getSExtValue();
13246 break;
13247 }
13248 return;
13249 }
13250 // The K and L constraints apply *only* to logical immediates, including
13251 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13252 // been removed and MOV should be used). So these constraints have to
13253 // distinguish between bit patterns that are valid 32-bit or 64-bit
13254 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13255 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13256 // versa.
13257 case 'K':
13258 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13259 break;
13260 return;
13261 case 'L':
13262 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13263 break;
13264 return;
13265 // The M and N constraints are a superset of K and L respectively, for use
13266 // with the MOV (immediate) alias. As well as the logical immediates they
13267 // also match 32 or 64-bit immediates that can be loaded either using a
13268 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13269 // (M) or 64-bit 0x1234000000000000 (N) etc.
13270 // As a note some of this code is liberally stolen from the asm parser.
13271 case 'M': {
13272 if (!isUInt<32>(CVal))
13273 return;
13274 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13275 break;
13276 if ((CVal & 0xFFFF) == CVal)
13277 break;
13278 if ((CVal & 0xFFFF0000ULL) == CVal)
13279 break;
13280 uint64_t NCVal = ~(uint32_t)CVal;
13281 if ((NCVal & 0xFFFFULL) == NCVal)
13282 break;
13283 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13284 break;
13285 return;
13286 }
13287 case 'N': {
13288 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13289 break;
13290 if ((CVal & 0xFFFFULL) == CVal)
13291 break;
13292 if ((CVal & 0xFFFF0000ULL) == CVal)
13293 break;
13294 if ((CVal & 0xFFFF00000000ULL) == CVal)
13295 break;
13296 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13297 break;
13298 uint64_t NCVal = ~CVal;
13299 if ((NCVal & 0xFFFFULL) == NCVal)
13300 break;
13301 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13302 break;
13303 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13304 break;
13305 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13306 break;
13307 return;
13308 }
13309 default:
13310 return;
13311 }
13312
13313 // All assembler immediates are 64-bit integers.
13314 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13315 break;
13316 }
13317
13318 if (Result.getNode()) {
13319 Ops.push_back(Result);
13320 return;
13321 }
13322
13323 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13324}
13325
13326//===----------------------------------------------------------------------===//
13327// AArch64 Advanced SIMD Support
13328//===----------------------------------------------------------------------===//
13329
13330/// WidenVector - Given a value in the V64 register class, produce the
13331/// equivalent value in the V128 register class.
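/// For example (illustrative): a v2f32 value becomes the low half of a v4f32,
/// inserted into an UNDEF v4f32 at index 0.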
13333 EVT VT = V64Reg.getValueType();
13334 unsigned NarrowSize = VT.getVectorNumElements();
13335 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13336 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13337 SDLoc DL(V64Reg);
13338
13339 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13340 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13341}
13342
13343/// getExtFactor - Determine the adjustment factor for the position when
13344/// generating an "extract from vector registers" instruction.
13345static unsigned getExtFactor(SDValue &V) {
13346 EVT EltType = V.getValueType().getVectorElementType();
13347 return EltType.getSizeInBits() / 8;
13348}
13349
13350// Check if a vector is built from one vector via extracted elements of
13351// another together with an AND mask, ensuring that all elements fit
13352// within range. This can be reconstructed using AND and NEON's TBL1.
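// For example (illustrative, v8i8 case): a BUILD_VECTOR whose lane i is
// (extractelt Src, (and (extractelt MaskVec, i), 7)) for i = 0..7 can be
// rebuilt as TBL1(Src widened to v16i8, (and MaskVec, splat(7))).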
13354 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13355 SDLoc DL(Op);
13356 EVT VT = Op.getValueType();
13357 assert(!VT.isScalableVector() &&
13358 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13359
13360 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13361 // directly to TBL1.
13362 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13363 return SDValue();
13364
13365 unsigned NumElts = VT.getVectorNumElements();
13366 assert((NumElts == 8 || NumElts == 16) &&
13367 "Need to have exactly 8 or 16 elements in vector.");
13368
13369 SDValue SourceVec;
13370 SDValue MaskSourceVec;
13371 SmallVector<SDValue, 16> AndMaskConstants;
13372
13373 for (unsigned i = 0; i < NumElts; ++i) {
13374 SDValue V = Op.getOperand(i);
13375 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13376 return SDValue();
13377
13378 SDValue OperandSourceVec = V.getOperand(0);
13379 if (!SourceVec)
13380 SourceVec = OperandSourceVec;
13381 else if (SourceVec != OperandSourceVec)
13382 return SDValue();
13383
13384 // This only looks at shuffles with elements that are
13385 // a) truncated by a constant AND mask extracted from a mask vector, or
13386 // b) extracted directly from a mask vector.
13387 SDValue MaskSource = V.getOperand(1);
13388 if (MaskSource.getOpcode() == ISD::AND) {
13389 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13390 return SDValue();
13391
13392 AndMaskConstants.push_back(MaskSource.getOperand(1));
13393 MaskSource = MaskSource->getOperand(0);
13394 } else if (!AndMaskConstants.empty()) {
13395 // Either all or no operands should have an AND mask.
13396 return SDValue();
13397 }
13398
13399 // An ANY_EXTEND may be inserted between the AND and the source vector
13400 // extraction. We don't care about that, so we can just skip it.
13401 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13402 MaskSource = MaskSource.getOperand(0);
13403
13404 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13405 return SDValue();
13406
13407 SDValue MaskIdx = MaskSource.getOperand(1);
13408 if (!isa<ConstantSDNode>(MaskIdx) ||
13409 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13410 return SDValue();
13411
13412 // We only apply this if all elements come from the same vector with the
13413 // same vector type.
13414 if (!MaskSourceVec) {
13415 MaskSourceVec = MaskSource->getOperand(0);
13416 if (MaskSourceVec.getValueType() != VT)
13417 return SDValue();
13418 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13419 return SDValue();
13420 }
13421 }
13422
13423 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13424 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13425 // insert, we know that the index in the mask must be smaller than the number
13426 // of elements in the source, or we would have an out-of-bounds access.
13427 if (NumElts == 8)
13428 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13429 DAG.getUNDEF(VT));
13430
13431 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13432 if (!AndMaskConstants.empty())
13433 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13434 DAG.getBuildVector(VT, DL, AndMaskConstants));
13435
13436 return DAG.getNode(
13438 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13439 MaskSourceVec);
13440}
13441
13442// Gather data to see if the operation can be modelled as a
13443// shuffle in combination with VEXTs.
13445 SelectionDAG &DAG) const {
13446 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13447 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13448 SDLoc DL(Op);
13449 EVT VT = Op.getValueType();
13450 assert(!VT.isScalableVector() &&
13451 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13452 unsigned NumElts = VT.getVectorNumElements();
13453
13454 struct ShuffleSourceInfo {
13455 SDValue Vec;
13456 unsigned MinElt;
13457 unsigned MaxElt;
13458
13459 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13460 // be compatible with the shuffle we intend to construct. As a result
13461 // ShuffleVec will be some sliding window into the original Vec.
13462 SDValue ShuffleVec;
13463
13464 // Code should guarantee that element i in Vec starts at element
13465 // "WindowBase + i * WindowScale" in ShuffleVec.
13466 int WindowBase;
13467 int WindowScale;
13468
13469 ShuffleSourceInfo(SDValue Vec)
13470 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13471 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13472
13473 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13474 };
13475
13476 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13477 // node.
13479 for (unsigned i = 0; i < NumElts; ++i) {
13480 SDValue V = Op.getOperand(i);
13481 if (V.isUndef())
13482 continue;
13483 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13484 !isa<ConstantSDNode>(V.getOperand(1)) ||
13485 V.getOperand(0).getValueType().isScalableVector()) {
13486 LLVM_DEBUG(
13487 dbgs() << "Reshuffle failed: "
13488 "a shuffle can only come from building a vector from "
13489 "various elements of other fixed-width vectors, provided "
13490 "their indices are constant\n");
13491 return SDValue();
13492 }
13493
13494 // Add this element source to the list if it's not already there.
13495 SDValue SourceVec = V.getOperand(0);
13496 auto Source = find(Sources, SourceVec);
13497 if (Source == Sources.end())
13498 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13499
13500 // Update the minimum and maximum lane number seen.
13501 unsigned EltNo = V.getConstantOperandVal(1);
13502 Source->MinElt = std::min(Source->MinElt, EltNo);
13503 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13504 }
13505
13506 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13507 // better than moving to/from gpr registers for larger vectors.
13508 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13509 // Construct a mask for the tbl. We may need to adjust the index for types
13510 // larger than i8.
13512 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13513 for (unsigned I = 0; I < NumElts; ++I) {
13514 SDValue V = Op.getOperand(I);
13515 if (V.isUndef()) {
13516 for (unsigned OF = 0; OF < OutputFactor; OF++)
13517 Mask.push_back(-1);
13518 continue;
13519 }
13520 // Set the Mask lanes adjusted for the size of the input and output
13521 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13522 // output element, adjusted in their positions per input and output types.
13523 unsigned Lane = V.getConstantOperandVal(1);
13524 for (unsigned S = 0; S < Sources.size(); S++) {
13525 if (V.getOperand(0) == Sources[S].Vec) {
13526 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13527 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13528 for (unsigned OF = 0; OF < OutputFactor; OF++)
13529 Mask.push_back(InputBase + OF);
13530 break;
13531 }
13532 }
13533 }
13534
13535 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13536 // v16i8, and the TBLMask
13537 SmallVector<SDValue, 16> TBLOperands;
13538 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13539 ? Intrinsic::aarch64_neon_tbl3
13540 : Intrinsic::aarch64_neon_tbl4,
13541 DL, MVT::i32));
13542 for (unsigned i = 0; i < Sources.size(); i++) {
13543 SDValue Src = Sources[i].Vec;
13544 EVT SrcVT = Src.getValueType();
13545 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13546 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13547 "Expected a legally typed vector");
13548 if (SrcVT.is64BitVector())
13549 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13550 DAG.getUNDEF(MVT::v8i8));
13551 TBLOperands.push_back(Src);
13552 }
13553
13555 for (unsigned i = 0; i < Mask.size(); i++)
13556 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13557 assert((Mask.size() == 8 || Mask.size() == 16) &&
13558 "Expected a v8i8 or v16i8 Mask");
13559 TBLOperands.push_back(DAG.getBuildVector(
13560 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13561
13562 SDValue Shuffle =
13564 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13565 return DAG.getBitcast(VT, Shuffle);
13566 }
13567
13568 if (Sources.size() > 2) {
13569 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13570 << "sensible when at most two source vectors are "
13571 << "involved\n");
13572 return SDValue();
13573 }
13574
13575 // Find out the smallest element size among result and two sources, and use
13576 // it as element size to build the shuffle_vector.
13577 EVT SmallestEltTy = VT.getVectorElementType();
13578 for (auto &Source : Sources) {
13579 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13580 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13581 SmallestEltTy = SrcEltTy;
13582 }
13583 }
13584 unsigned ResMultiplier =
13585 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13586 uint64_t VTSize = VT.getFixedSizeInBits();
13587 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13588 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13589
13590 // If the source vector is too wide or too narrow, we may nevertheless be able
13591 // to construct a compatible shuffle either by concatenating it with UNDEF or
13592 // extracting a suitable range of elements.
13593 for (auto &Src : Sources) {
13594 EVT SrcVT = Src.ShuffleVec.getValueType();
13595
13596 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13597 if (SrcVTSize == TypeSize::getFixed(VTSize))
13598 continue;
13599
13600 // This stage of the search produces a source with the same element type as
13601 // the original, but with a total width matching the BUILD_VECTOR output.
13602 EVT EltVT = SrcVT.getVectorElementType();
13603 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13604 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13605
13606 if (SrcVTSize.getFixedValue() < VTSize) {
13607 assert(2 * SrcVTSize == VTSize);
13608 // We can pad out the smaller vector for free, so if it's part of a
13609 // shuffle...
13610 Src.ShuffleVec =
13611 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13612 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13613 continue;
13614 }
13615
13616 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13617 LLVM_DEBUG(
13618 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13619 return SDValue();
13620 }
13621
13622 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13623 LLVM_DEBUG(
13624 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13625 return SDValue();
13626 }
13627
13628 if (Src.MinElt >= NumSrcElts) {
13629 // The extraction can just take the second half
13630 Src.ShuffleVec =
13631 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13632 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13633 Src.WindowBase = -NumSrcElts;
13634 } else if (Src.MaxElt < NumSrcElts) {
13635 // The extraction can just take the first half
13636 Src.ShuffleVec =
13637 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13638 DAG.getConstant(0, DL, MVT::i64));
13639 } else {
13640 // An actual VEXT is needed
13641 SDValue VEXTSrc1 =
13642 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13643 DAG.getConstant(0, DL, MVT::i64));
13644 SDValue VEXTSrc2 =
13645 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13646 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13647 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13648
13649 if (!SrcVT.is64BitVector()) {
13650 LLVM_DEBUG(
13651 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13652 "for SVE vectors.");
13653 return SDValue();
13654 }
13655
13656 Src.ShuffleVec =
13657 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13658 DAG.getConstant(Imm, DL, MVT::i32));
13659 Src.WindowBase = -Src.MinElt;
13660 }
13661 }
13662
13663 // Another possible incompatibility occurs from the vector element types. We
13664 // can fix this by bitcasting the source vectors to the same type we intend
13665 // for the shuffle.
13666 for (auto &Src : Sources) {
13667 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13668 if (SrcEltTy == SmallestEltTy)
13669 continue;
13670 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13671 if (DAG.getDataLayout().isBigEndian()) {
13672 Src.ShuffleVec =
13673 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13674 } else {
13675 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13676 }
13677 Src.WindowScale =
13678 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13679 Src.WindowBase *= Src.WindowScale;
13680 }
13681
13682 // Final check before we try to actually produce a shuffle.
13683 LLVM_DEBUG({
13684 for (auto Src : Sources)
13685 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13686 });
13687
13688 // The stars all align, our next step is to produce the mask for the shuffle.
13689 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13690 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13691 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13692 SDValue Entry = Op.getOperand(i);
13693 if (Entry.isUndef())
13694 continue;
13695
13696 auto Src = find(Sources, Entry.getOperand(0));
13697 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13698
13699 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13700 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13701 // segment.
13702 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13703 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13704 VT.getScalarSizeInBits());
13705 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13706
13707 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13708 // starting at the appropriate offset.
13709 int *LaneMask = &Mask[i * ResMultiplier];
13710
13711 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13712 ExtractBase += NumElts * (Src - Sources.begin());
13713 for (int j = 0; j < LanesDefined; ++j)
13714 LaneMask[j] = ExtractBase + j;
13715 }
13716
13717 // Final check before we try to produce nonsense...
13718 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13719 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13720 return SDValue();
13721 }
13722
13723 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13724 for (unsigned i = 0; i < Sources.size(); ++i)
13725 ShuffleOps[i] = Sources[i].ShuffleVec;
13726
13727 SDValue Shuffle =
13728 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13729 SDValue V;
13730 if (DAG.getDataLayout().isBigEndian()) {
13731 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13732 } else {
13733 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13734 }
13735
13736 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13737 dbgs() << "Reshuffle, creating node: "; V.dump(););
13738
13739 return V;
13740}
13741
13742// check if an EXT instruction can handle the shuffle mask when the
13743// vector sources of the shuffle are the same.
13744static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13745 unsigned NumElts = VT.getVectorNumElements();
13746
13747 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13748 if (M[0] < 0)
13749 return false;
13750
13751 Imm = M[0];
13752
13753 // If this is a VEXT shuffle, the immediate value is the index of the first
13754 // element. The other shuffle indices must be the successive elements after
13755 // the first one.
13756 unsigned ExpectedElt = Imm;
13757 for (unsigned i = 1; i < NumElts; ++i) {
13758 // Increment the expected index. If it wraps around, just follow it
13759 // back to index zero and keep going.
13760 ++ExpectedElt;
13761 if (ExpectedElt == NumElts)
13762 ExpectedElt = 0;
13763
13764 if (M[i] < 0)
13765 continue; // ignore UNDEF indices
13766 if (ExpectedElt != static_cast<unsigned>(M[i]))
13767 return false;
13768 }
13769
13770 return true;
13771}
13772
13773// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13774// v4i32s. This is really a truncate, which we can construct out of (legal)
13775// concats and truncate nodes.
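// For example (illustrative), with A, B, C, D : v4i32:
//   (v16i8 build_vector A[0], A[1], A[2], A[3], B[0], ..., D[3])
// becomes
//   concat(trunc(concat(trunc(A), trunc(B))), trunc(concat(trunc(C), trunc(D))))
// where each truncate of a concatenation lowers to a UZP1.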
13777 if (V.getValueType() != MVT::v16i8)
13778 return SDValue();
13779 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13780
13781 for (unsigned X = 0; X < 4; X++) {
13782 // Check the first item in each group is an extract from lane 0 of a v4i32
13783 // or v4i16.
13784 SDValue BaseExt = V.getOperand(X * 4);
13785 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13786 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13787 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13788 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13789 BaseExt.getConstantOperandVal(1) != 0)
13790 return SDValue();
13791 SDValue Base = BaseExt.getOperand(0);
13792 // And check the other items are extracts from the same vector.
13793 for (unsigned Y = 1; Y < 4; Y++) {
13794 SDValue Ext = V.getOperand(X * 4 + Y);
13795 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13796 Ext.getOperand(0) != Base ||
13797 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13798 Ext.getConstantOperandVal(1) != Y)
13799 return SDValue();
13800 }
13801 }
13802
13803 // Turn the buildvector into a series of truncates and concats, which will
13804 // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
13805 // concatenated together to produce 2 v8i16. These are both truncated and
13806 // concatenated together.
13807 SDLoc DL(V);
13808 SDValue Trunc[4] = {
13809 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13810 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13811 for (SDValue &V : Trunc)
13812 if (V.getValueType() == MVT::v4i32)
13813 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13814 SDValue Concat0 =
13815 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13816 SDValue Concat1 =
13817 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13818 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13819 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13820 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13821}
13822
13823/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13824/// element width than the vector lane type. If that is the case the function
13825/// returns true and writes the value of the DUP instruction lane operand into
13826/// DupLaneOp
13827static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13828 unsigned &DupLaneOp) {
13829 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13830 "Only possible block sizes for wide DUP are: 16, 32, 64");
13831
13832 if (BlockSize <= VT.getScalarSizeInBits())
13833 return false;
13834 if (BlockSize % VT.getScalarSizeInBits() != 0)
13835 return false;
13836 if (VT.getSizeInBits() % BlockSize != 0)
13837 return false;
13838
13839 size_t SingleVecNumElements = VT.getVectorNumElements();
13840 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13841 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13842
13843 // We are looking for masks like
13844 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13845 // might be replaced by 'undefined'. BlockIndices will eventually contain
13846 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13847 // for the above examples)
13848 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13849 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13850 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13851 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13852 if (Elt < 0)
13853 continue;
13854 // For now we don't support shuffles that use the second operand
13855 if ((unsigned)Elt >= SingleVecNumElements)
13856 return false;
13857 if (BlockElts[I] < 0)
13858 BlockElts[I] = Elt;
13859 else if (BlockElts[I] != Elt)
13860 return false;
13861 }
13862
13863 // We found a candidate block (possibly with some undefs). It must be a
13864 // sequence of consecutive integers starting with a value divisible by
13865 // NumEltsPerBlock with some values possibly replaced by undef-s.
13866
13867 // Find first non-undef element
13868 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13869 assert(FirstRealEltIter != BlockElts.end() &&
13870 "Shuffle with all-undefs must have been caught by previous cases, "
13871 "e.g. isSplat()");
13872 if (FirstRealEltIter == BlockElts.end()) {
13873 DupLaneOp = 0;
13874 return true;
13875 }
13876
13877 // Index of FirstRealElt in BlockElts
13878 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13879
13880 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13881 return false;
13882 // BlockElts[0] must have the following value if it isn't undef:
13883 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13884
13885 // Check the first element
13886 if (Elt0 % NumEltsPerBlock != 0)
13887 return false;
13888 // Check that the sequence indeed consists of consecutive integers (modulo
13889 // undefs)
13890 for (size_t I = 0; I < NumEltsPerBlock; I++)
13891 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13892 return false;
13893
13894 DupLaneOp = Elt0 / NumEltsPerBlock;
13895 return true;
13896}
13897
13898// check if an EXT instruction can handle the shuffle mask when the
13899// vector sources of the shuffle are different.
13900static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13901 unsigned &Imm) {
13902 // Look for the first non-undef element.
13903 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13904
13905 // Benefit from APInt to handle overflow when calculating expected element.
13906 unsigned NumElts = VT.getVectorNumElements();
13907 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13908 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13909 /*implicitTrunc=*/true);
13910 // The following shuffle indices must be the successive elements after the
13911 // first real element.
13912 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13913 return Elt != ExpectedElt++ && Elt >= 0;
13914 });
13915 if (FoundWrongElt)
13916 return false;
13917
13918 // The index of an EXT is the first element if it is not UNDEF.
13919 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13920 // value of the first element. E.g.
13921 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13922 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13923 // ExpectedElt is the last mask index plus 1.
13924 Imm = ExpectedElt.getZExtValue();
13925
13926 // There are two different cases that require reversing the input vectors.
13927 // For example, for vector <4 x i32> we have the following cases,
13928 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13929 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13930 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13931 // to reverse two input vectors.
13932 if (Imm < NumElts)
13933 ReverseEXT = true;
13934 else
13935 Imm -= NumElts;
13936
13937 return true;
13938}
13939
13940/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13941/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13942/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13943static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13944 unsigned NumElts = VT.getVectorNumElements();
13945 if (NumElts % 2 != 0)
13946 return false;
13947 WhichResult = (M[0] == 0 ? 0 : 1);
13948 unsigned Idx = WhichResult * NumElts / 2;
13949 for (unsigned i = 0; i != NumElts; i += 2) {
13950 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13951 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13952 return false;
13953 Idx += 1;
13954 }
13955
13956 return true;
13957}
13958
13959/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13960/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13961/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13962static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13963 unsigned Half = VT.getVectorNumElements() / 2;
13964 WhichResult = (M[0] == 0 ? 0 : 1);
13965 for (unsigned j = 0; j != 2; ++j) {
13966 unsigned Idx = WhichResult;
13967 for (unsigned i = 0; i != Half; ++i) {
13968 int MIdx = M[i + j * Half];
13969 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13970 return false;
13971 Idx += 2;
13972 }
13973 }
13974
13975 return true;
13976}
13977
13978/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13979/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13980/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13981static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13982 unsigned NumElts = VT.getVectorNumElements();
13983 if (NumElts % 2 != 0)
13984 return false;
13985 WhichResult = (M[0] == 0 ? 0 : 1);
13986 for (unsigned i = 0; i < NumElts; i += 2) {
13987 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13988 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13989 return false;
13990 }
13991 return true;
13992}
13993
13994static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13995 bool &DstIsLeft, int &Anomaly) {
13996 if (M.size() != static_cast<size_t>(NumInputElements))
13997 return false;
13998
13999 int NumLHSMatch = 0, NumRHSMatch = 0;
14000 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14001
14002 for (int i = 0; i < NumInputElements; ++i) {
14003 if (M[i] == -1) {
14004 ++NumLHSMatch;
14005 ++NumRHSMatch;
14006 continue;
14007 }
14008
14009 if (M[i] == i)
14010 ++NumLHSMatch;
14011 else
14012 LastLHSMismatch = i;
14013
14014 if (M[i] == i + NumInputElements)
14015 ++NumRHSMatch;
14016 else
14017 LastRHSMismatch = i;
14018 }
14019
14020 if (NumLHSMatch == NumInputElements - 1) {
14021 DstIsLeft = true;
14022 Anomaly = LastLHSMismatch;
14023 return true;
14024 } else if (NumRHSMatch == NumInputElements - 1) {
14025 DstIsLeft = false;
14026 Anomaly = LastRHSMismatch;
14027 return true;
14028 }
14029
14030 return false;
14031}
14032
14033static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14034 if (VT.getSizeInBits() != 128)
14035 return false;
14036
14037 unsigned NumElts = VT.getVectorNumElements();
14038
14039 for (int I = 0, E = NumElts / 2; I != E; I++) {
14040 if (Mask[I] != I)
14041 return false;
14042 }
14043
14044 int Offset = NumElts / 2;
14045 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14046 if (Mask[I] != I + SplitLHS * Offset)
14047 return false;
14048 }
14049
14050 return true;
14051}
14052
14054 SDLoc DL(Op);
14055 EVT VT = Op.getValueType();
14056 SDValue V0 = Op.getOperand(0);
14057 SDValue V1 = Op.getOperand(1);
14058 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14059
14062 return SDValue();
14063
14064 bool SplitV0 = V0.getValueSizeInBits() == 128;
14065
14066 if (!isConcatMask(Mask, VT, SplitV0))
14067 return SDValue();
14068
14069 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14070 if (SplitV0) {
14071 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14072 DAG.getConstant(0, DL, MVT::i64));
14073 }
14074 if (V1.getValueSizeInBits() == 128) {
14075 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14076 DAG.getConstant(0, DL, MVT::i64));
14077 }
14078 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14079}
14080
14081/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14082/// the specified operations to build the shuffle. ID is the perfect-shuffle
14083/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14084/// table entry and LHS/RHS are the immediate inputs for this stage of the
14085/// shuffle.
14087 unsigned PFEntry, SDValue LHS,
14088 SDValue RHS, SelectionDAG &DAG,
14089 const SDLoc &DL) {
14090 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14091 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14092 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
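 // Illustrative decode of the encoding: each 13-bit ID packs four lane
 // indices in base 9 (0-7 select a lane, 8 means undef), so the identity
 // mask <0,1,2,3> encodes as ((0*9+1)*9+2)*9+3 == (1*9+2)*9+3, the value the
 // OP_COPY check below compares against.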
14093
14094 enum {
14095 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14096 OP_VREV,
14097 OP_VDUP0,
14098 OP_VDUP1,
14099 OP_VDUP2,
14100 OP_VDUP3,
14101 OP_VEXT1,
14102 OP_VEXT2,
14103 OP_VEXT3,
14104 OP_VUZPL, // VUZP, left result
14105 OP_VUZPR, // VUZP, right result
14106 OP_VZIPL, // VZIP, left result
14107 OP_VZIPR, // VZIP, right result
14108 OP_VTRNL, // VTRN, left result
14109 OP_VTRNR, // VTRN, right result
14110 OP_MOVLANE // Move lane. RHSID is the lane to move into
14111 };
14112
14113 if (OpNum == OP_COPY) {
14114 if (LHSID == (1 * 9 + 2) * 9 + 3)
14115 return LHS;
14116 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14117 return RHS;
14118 }
14119
14120 if (OpNum == OP_MOVLANE) {
14121 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14122 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14123 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14124 Elt = 3 - Elt;
14125 while (Elt > 0) {
14126 ID /= 9;
14127 Elt--;
14128 }
14129 return (ID % 9 == 8) ? -1 : ID % 9;
14130 };
14131
14132 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14133 // get the lane to move from the PFID, which is always from the
14134 // original vectors (V1 or V2).
14136 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14137 EVT VT = OpLHS.getValueType();
14138 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14139 unsigned ExtLane = 0;
14140 SDValue Input;
14141
14142 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14143 // convert into a higher type.
14144 if (RHSID & 0x4) {
14145 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14146 if (MaskElt == -1)
14147 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14148 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14149 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14150 Input = MaskElt < 2 ? V1 : V2;
14151 if (VT.getScalarSizeInBits() == 16) {
14152 Input = DAG.getBitcast(MVT::v2f32, Input);
14153 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14154 } else {
14155 assert(VT.getScalarSizeInBits() == 32 &&
14156 "Expected 16 or 32 bit shuffle elements");
14157 Input = DAG.getBitcast(MVT::v2f64, Input);
14158 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14159 }
14160 } else {
14161 int MaskElt = getPFIDLane(ID, RHSID);
14162 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14163 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14164 Input = MaskElt < 4 ? V1 : V2;
14165 // Be careful about creating illegal types. Use f16 instead of i16.
14166 if (VT == MVT::v4i16) {
14167 Input = DAG.getBitcast(MVT::v4f16, Input);
14168 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14169 }
14170 }
14171 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14172 Input.getValueType().getVectorElementType(),
14173 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14174 SDValue Ins =
14175 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14176 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14177 return DAG.getBitcast(VT, Ins);
14178 }
14179
14180 SDValue OpLHS, OpRHS;
14181 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14182 RHS, DAG, DL);
14183 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14184 RHS, DAG, DL);
14185 EVT VT = OpLHS.getValueType();
14186
14187 switch (OpNum) {
14188 default:
14189 llvm_unreachable("Unknown shuffle opcode!");
14190 case OP_VREV:
14191 // VREV divides the vector in half and swaps within the half.
14192 if (VT.getVectorElementType() == MVT::i32 ||
14193 VT.getVectorElementType() == MVT::f32)
14194 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14195 // vrev <4 x i16> -> REV32
14196 if (VT.getVectorElementType() == MVT::i16 ||
14197 VT.getVectorElementType() == MVT::f16 ||
14198 VT.getVectorElementType() == MVT::bf16)
14199 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14200 // vrev <4 x i8> -> REV16
14201 assert(VT.getVectorElementType() == MVT::i8);
14202 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14203 case OP_VDUP0:
14204 case OP_VDUP1:
14205 case OP_VDUP2:
14206 case OP_VDUP3: {
14207 EVT EltTy = VT.getVectorElementType();
14208 unsigned Opcode;
14209 if (EltTy == MVT::i8)
14210 Opcode = AArch64ISD::DUPLANE8;
14211 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14212 Opcode = AArch64ISD::DUPLANE16;
14213 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14214 Opcode = AArch64ISD::DUPLANE32;
14215 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14216 Opcode = AArch64ISD::DUPLANE64;
14217 else
14218 llvm_unreachable("Invalid vector element type?");
14219
14220 if (VT.getSizeInBits() == 64)
14221 OpLHS = WidenVector(OpLHS, DAG);
14222 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14223 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14224 }
14225 case OP_VEXT1:
14226 case OP_VEXT2:
14227 case OP_VEXT3: {
14228 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14229 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14230 DAG.getConstant(Imm, DL, MVT::i32));
14231 }
14232 case OP_VUZPL:
14233 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14234 case OP_VUZPR:
14235 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14236 case OP_VZIPL:
14237 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14238 case OP_VZIPR:
14239 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14240 case OP_VTRNL:
14241 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14242 case OP_VTRNR:
14243 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14244 }
14245}
14246
14247static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14248 SelectionDAG &DAG) {
14249 // Check to see if we can use the TBL instruction.
14250 SDValue V1 = Op.getOperand(0);
14251 SDValue V2 = Op.getOperand(1);
14252 SDLoc DL(Op);
14253
14254 EVT EltVT = Op.getValueType().getVectorElementType();
14255 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14256
14257 bool Swap = false;
14258 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14259 std::swap(V1, V2);
14260 Swap = true;
14261 }
14262
14263 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14264 // out of range values with 0s. We do need to make sure that any out-of-range
14265 // values are really out-of-range for a v16i8 vector.
14266 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14267 MVT IndexVT = MVT::v8i8;
14268 unsigned IndexLen = 8;
14269 if (Op.getValueSizeInBits() == 128) {
14270 IndexVT = MVT::v16i8;
14271 IndexLen = 16;
14272 }
14273
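// Expand the element-level shuffle mask into byte-level TBL indices. For
// example, with v4i16 elements (BytesPerElt == 2) a mask value of 5 becomes
// the byte indices 10 and 11. When the sources were swapped above, each index
// is rebased into the other half of the combined table so it still refers to
// the same input.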
14274 SmallVector<SDValue, 8> TBLMask;
14275 for (int Val : ShuffleMask) {
14276 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14277 unsigned Offset = Byte + Val * BytesPerElt;
14278 if (Swap)
14279 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14280 if (IsUndefOrZero && Offset >= IndexLen)
14281 Offset = 255;
14282 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14283 }
14284 }
14285
14286 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14287 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14288
14289 SDValue Shuffle;
14290 if (IsUndefOrZero) {
14291 if (IndexLen == 8)
14292 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14293 Shuffle = DAG.getNode(
14294 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14295 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14296 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14297 } else {
14298 if (IndexLen == 8) {
14299 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14300 Shuffle = DAG.getNode(
14301 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14302 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14303 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14304 } else {
14305 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14306 // cannot currently represent the register constraints on the input
14307 // table registers.
14308 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14309 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14310 // IndexLen));
14311 Shuffle = DAG.getNode(
14312 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14313 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
14314 V2Cst,
14315 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14316 }
14317 }
14318 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14319}
14320
14321static unsigned getDUPLANEOp(EVT EltType) {
14322 if (EltType == MVT::i8)
14323 return AArch64ISD::DUPLANE8;
14324 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14325 return AArch64ISD::DUPLANE16;
14326 if (EltType == MVT::i32 || EltType == MVT::f32)
14327 return AArch64ISD::DUPLANE32;
14328 if (EltType == MVT::i64 || EltType == MVT::f64)
14329 return AArch64ISD::DUPLANE64;
14330
14331 llvm_unreachable("Invalid vector element type?");
14332}
14333
14334static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14335 unsigned Opcode, SelectionDAG &DAG) {
14336 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14337 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14338 // Match: dup (bitcast (extract_subv X, C)), LaneC
14339 if (BitCast.getOpcode() != ISD::BITCAST ||
14340 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14341 return false;
14342
14343 // The extract index must align in the destination type. That may not
14344 // happen if the bitcast is from a narrow to a wide type.
14345 SDValue Extract = BitCast.getOperand(0);
14346 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14347 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14348 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14349 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14350 if (ExtIdxInBits % CastedEltBitWidth != 0)
14351 return false;
14352
14353 // Can't handle cases where vector size is not 128-bit
14354 if (!Extract.getOperand(0).getValueType().is128BitVector())
14355 return false;
14356
14357 // Update the lane value by offsetting with the scaled extract index.
14358 LaneC += ExtIdxInBits / CastedEltBitWidth;
14359
14360 // Determine the casted vector type of the wide vector input.
14361 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14362 // Examples:
14363 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14364 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14365 unsigned SrcVecNumElts =
14366 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14367 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14368 SrcVecNumElts);
14369 return true;
14370 };
14371 MVT CastVT;
14372 if (getScaledOffsetDup(V, Lane, CastVT)) {
14373 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14374 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14375 V.getOperand(0).getValueType().is128BitVector()) {
14376 // The lane is incremented by the index of the extract.
14377 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14378 Lane += V.getConstantOperandVal(1);
14379 V = V.getOperand(0);
14380 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14381 // The lane is decremented if we are splatting from the 2nd operand.
14382 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14383 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14384 Lane -= Idx * VT.getVectorNumElements() / 2;
14385 V = WidenVector(V.getOperand(Idx), DAG);
14386 } else if (VT.getSizeInBits() == 64) {
14387 // Widen the operand to 128-bit register with undef.
14388 V = WidenVector(V, DAG);
14389 }
14390 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14391}
14392
14393// Try to widen element type to get a new mask value for a better permutation
14394// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14395// UZP1/2, TRN1/2, REV, INS, etc.
14396// For example:
14397// shufflevector <4 x i32> %a, <4 x i32> %b,
14398// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14399// is equivalent to:
14400// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14401// Finally, we can get:
14402// mov v0.d[0], v1.d[1]
14403static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14404 SDLoc DL(Op);
14405 EVT VT = Op.getValueType();
14406 EVT ScalarVT = VT.getVectorElementType();
14407 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14408 SDValue V0 = Op.getOperand(0);
14409 SDValue V1 = Op.getOperand(1);
14410 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14411
14412 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64, ...,
14413 // we need to make sure the wider element type is legal. Thus, ElementSize
14414 // should not be larger than 32 bits, and the i1 type should also be excluded.
14415 if (ElementSize > 32 || ElementSize == 1)
14416 return SDValue();
14417
14418 SmallVector<int, 8> NewMask;
14419 if (widenShuffleMaskElts(Mask, NewMask)) {
14420 MVT NewEltVT = VT.isFloatingPoint()
14421 ? MVT::getFloatingPointVT(ElementSize * 2)
14422 : MVT::getIntegerVT(ElementSize * 2);
14423 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14424 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14425 V0 = DAG.getBitcast(NewVT, V0);
14426 V1 = DAG.getBitcast(NewVT, V1);
14427 return DAG.getBitcast(VT,
14428 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14429 }
14430 }
14431
14432 return SDValue();
14433}
14434
14435// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14437 ArrayRef<int> ShuffleMask,
14438 SelectionDAG &DAG) {
14439 SDValue Tbl1 = Op->getOperand(0);
14440 SDValue Tbl2 = Op->getOperand(1);
14441 SDLoc DL(Op);
14442 SDValue Tbl2ID =
14443 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14444
14445 EVT VT = Op.getValueType();
14446 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14447 Tbl1.getOperand(0) != Tbl2ID ||
14448 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14449 Tbl2.getOperand(0) != Tbl2ID)
14450 return SDValue();
14451
14452 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14453 return SDValue();
14454
14455 SDValue Mask1 = Tbl1.getOperand(3);
14456 SDValue Mask2 = Tbl2.getOperand(3);
14457 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14458 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14459 return SDValue();
14460
14461 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14462 for (unsigned I = 0; I < 16; I++) {
14463 if (ShuffleMask[I] < 16)
14464 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14465 else {
14466 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14467 if (!C)
14468 return SDValue();
14469 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14470 }
14471 }
14472
14473 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14474 SDValue ID =
14475 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14476
14477 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14478 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14479 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14480}
14481
14482// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14483// but we don't have an appropriate instruction,
14484// so custom-lower it as ZIP1-with-zeros.
14485SDValue
14486AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14487 SelectionDAG &DAG) const {
14488 SDLoc DL(Op);
14489 EVT VT = Op.getValueType();
14490 SDValue SrcOp = Op.getOperand(0);
14491 EVT SrcVT = SrcOp.getValueType();
14492 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14493 "Unexpected extension factor.");
14494 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14495 // FIXME: support multi-step zipping?
14496 if (Scale != 2)
14497 return SDValue();
14498 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
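// For example, zero-extending the low half of a v16i8 to v8i16 becomes
// ZIP1(Src, zeros): each low source byte is interleaved with a zero byte,
// giving the zero-extended 16-bit lanes.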
14499 return DAG.getBitcast(VT,
14500 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14501}
14502
14503SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14504 SelectionDAG &DAG) const {
14505 SDLoc DL(Op);
14506 EVT VT = Op.getValueType();
14507
14508 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14509
14510 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14511 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14512
14513 // Convert shuffles that are directly supported on NEON to target-specific
14514 // DAG nodes, instead of keeping them as shuffles and matching them again
14515 // during code selection. This is more efficient and avoids the possibility
14516 // of inconsistencies between legalization and selection.
14517 ArrayRef<int> ShuffleMask = SVN->getMask();
14518
14519 SDValue V1 = Op.getOperand(0);
14520 SDValue V2 = Op.getOperand(1);
14521
14522 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14523 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14524 "Unexpected VECTOR_SHUFFLE mask size!");
14525
14526 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14527 return Res;
14528
14529 if (SVN->isSplat()) {
14530 int Lane = SVN->getSplatIndex();
14531 // If this is undef splat, generate it via "just" vdup, if possible.
14532 if (Lane == -1)
14533 Lane = 0;
14534
14535 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14536 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14537 V1.getOperand(0));
14538 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14539 // constant. If so, we can just reference the lane's definition directly.
14540 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14541 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14542 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14543
14544 // Otherwise, duplicate from the lane of the input vector.
14545 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14546 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14547 }
14548
14549 // Check if the mask matches a DUP for a wider element
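// For example, the v8i16 mask <2,3,2,3,2,3,2,3> is a 32-bit wide splat: after
// bitcasting V1 to v4i32 it becomes a DUPLANE32 of lane 1.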
14550 for (unsigned LaneSize : {64U, 32U, 16U}) {
14551 unsigned Lane = 0;
14552 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14553 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14554 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14555 : AArch64ISD::DUPLANE16;
14556 // Cast V1 to an integer vector with required lane size
14557 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14558 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14559 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14560 V1 = DAG.getBitcast(NewVecTy, V1);
14561 // Construct the DUP instruction
14562 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14563 // Cast back to the original type
14564 return DAG.getBitcast(VT, V1);
14565 }
14566 }
14567
14568 unsigned NumElts = VT.getVectorNumElements();
14569 unsigned EltSize = VT.getScalarSizeInBits();
14570 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14571 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14572 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14573 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14574 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14575 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14576
14577 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14578 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
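// A full reverse of 8- or 16-bit lanes is built in two steps: REV64 reverses
// the lanes within each 64-bit half, and the EXT by 8 bytes then swaps the
// two halves.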
14579 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14580 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14581 DAG.getConstant(8, DL, MVT::i32));
14582 }
14583
14584 bool ReverseEXT = false;
14585 unsigned Imm;
14586 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14587 if (ReverseEXT)
14588 std::swap(V1, V2);
14589 Imm *= getExtFactor(V1);
14590 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14591 DAG.getConstant(Imm, DL, MVT::i32));
14592 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14593 Imm *= getExtFactor(V1);
14594 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14595 DAG.getConstant(Imm, DL, MVT::i32));
14596 }
14597
14598 unsigned WhichResult;
14599 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14600 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14601 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14602 }
14603 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14604 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14605 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14606 }
14607 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14608 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14609 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14610 }
14611
14612 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14614 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14615 }
14616 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14617 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14618 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14619 }
14620 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14621 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14622 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14623 }
14624
14625 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14626 return Concat;
14627
14628 bool DstIsLeft;
14629 int Anomaly;
14630 int NumInputElements = V1.getValueType().getVectorNumElements();
14631 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14632 SDValue DstVec = DstIsLeft ? V1 : V2;
14633 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14634
14635 SDValue SrcVec = V1;
14636 int SrcLane = ShuffleMask[Anomaly];
14637 if (SrcLane >= NumInputElements) {
14638 SrcVec = V2;
14639 SrcLane -= NumElts;
14640 }
14641 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14642
14643 EVT ScalarVT = VT.getVectorElementType();
14644
14645 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14646 ScalarVT = MVT::i32;
14647
14648 return DAG.getNode(
14649 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14650 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14651 DstLaneV);
14652 }
14653
14654 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14655 return NewSD;
14656
14657 // If the shuffle is not directly supported and it has 4 elements, use
14658 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14659 if (NumElts == 4) {
14660 unsigned PFIndexes[4];
14661 for (unsigned i = 0; i != 4; ++i) {
14662 if (ShuffleMask[i] < 0)
14663 PFIndexes[i] = 8;
14664 else
14665 PFIndexes[i] = ShuffleMask[i];
14666 }
14667
14668 // Compute the index in the perfect shuffle table.
14669 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14670 PFIndexes[2] * 9 + PFIndexes[3];
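// For example, the mask <1,5,2,6> maps to index 1*729 + 5*81 + 2*9 + 6 = 1158.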
14671 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14672 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14673 DL);
14674 }
14675
14676 // Check for a "select shuffle", generating a BSL to pick between lanes in
14677 // V1/V2.
14678 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14679 assert(VT.getScalarSizeInBits() <= 32 &&
14680 "Expected larger vector element sizes to be handled already");
14681 SmallVector<SDValue> MaskElts;
14682 for (int M : ShuffleMask)
14683 MaskElts.push_back(DAG.getConstant(
14684 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14685 EVT IVT = VT.changeVectorElementTypeToInteger();
14686 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14687 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14688 DAG.getBitcast(IVT, V1),
14689 DAG.getBitcast(IVT, V2)));
14690 }
14691
14692 // Fall back to generating a TBL
14693 return GenerateTBL(Op, ShuffleMask, DAG);
14694}
14695
14696SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14697 SelectionDAG &DAG) const {
14698 EVT VT = Op.getValueType();
14699
14700 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14701 return LowerToScalableOp(Op, DAG);
14702
14703 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14704 "Unexpected vector type!");
14705
14706 // We can handle the constant cases during isel.
14707 if (isa<ConstantSDNode>(Op.getOperand(0)))
14708 return Op;
14709
14710 // There isn't a natural way to handle the general i1 case, so we use some
14711 // trickery with whilelo.
14712 SDLoc DL(Op);
14713 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14714 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14715 DAG.getValueType(MVT::i1));
14716 SDValue ID =
14717 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14718 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14719 if (VT == MVT::nxv1i1)
14720 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14721 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14722 Zero, SplatVal),
14723 Zero);
14724 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14725}
14726
14727SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14728 SelectionDAG &DAG) const {
14729 SDLoc DL(Op);
14730
14731 EVT VT = Op.getValueType();
14732 if (!isTypeLegal(VT) || !VT.isScalableVector())
14733 return SDValue();
14734
14735 // Current lowering only supports the SVE-ACLE types.
14736 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14737 return SDValue();
14738
14739 // The DUPQ operation is independent of element type so normalise to i64s.
14740 SDValue Idx128 = Op.getOperand(2);
14741
14742 // DUPQ can be used when idx is in range.
14743 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14744 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14745 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14746 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14747 }
14748
14749 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14750
14751 // The ACLE says this must produce the same result as:
14752 // svtbl(data, svadd_x(svptrue_b64(),
14753 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14754 // index * 2))
14755 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14756 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14757
14758 // create the vector 0,1,0,1,...
14759 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14760 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14761
14762 // create the vector idx64,idx64+1,idx64,idx64+1,...
14763 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14764 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14765 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
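// For example, Idx128 == 2 gives Idx64 == 4, so ShuffleMask is <4,5,4,5,...>
// and the TBL below replicates the third 128-bit quadword across the vector.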
14766
14767 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14768 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14769 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14770}
14771
14772
14773static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14774 APInt &UndefBits) {
14775 EVT VT = BVN->getValueType(0);
14776 APInt SplatBits, SplatUndef;
14777 unsigned SplatBitSize;
14778 bool HasAnyUndefs;
14779 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14780 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14781
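// For example, a v4i32 BUILD_VECTOR of four copies of 0x01020304 is reported
// as a 32-bit splat (NumSplats == 4); the loop below shifts CnstBits left by
// 32 bits and ORs in the splat four times to rebuild the full 128-bit value.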
14782 for (unsigned i = 0; i < NumSplats; ++i) {
14783 CnstBits <<= SplatBitSize;
14784 UndefBits <<= SplatBitSize;
14785 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14786 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14787 }
14788
14789 return true;
14790 }
14791
14792 return false;
14793}
14794
14795// Try 64-bit splatted SIMD immediate.
14796static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14797 const APInt &Bits) {
14798 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14799 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14800 EVT VT = Op.getValueType();
14801 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14802
14803 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14804 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14805
14806 SDLoc DL(Op);
14807 SDValue Mov =
14808 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14809 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14810 }
14811 }
14812
14813 return SDValue();
14814}
14815
14816// Try 32-bit splatted SIMD immediate.
14817static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14818 const APInt &Bits,
14819 const SDValue *LHS = nullptr) {
14820 EVT VT = Op.getValueType();
14821 if (VT.isFixedLengthVector() &&
14822 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14823 return SDValue();
14824
14825 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14826 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14827 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14828 bool isAdvSIMDModImm = false;
14829 uint64_t Shift;
14830
14831 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14832 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14833 Shift = 0;
14834 }
14835 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14836 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14837 Shift = 8;
14838 }
14839 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14840 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14841 Shift = 16;
14842 }
14843 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14844 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14845 Shift = 24;
14846 }
14847
14848 if (isAdvSIMDModImm) {
14849 SDLoc DL(Op);
14850 SDValue Mov;
14851
14852 if (LHS)
14853 Mov = DAG.getNode(NewOp, DL, MovTy,
14854 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14855 DAG.getConstant(Value, DL, MVT::i32),
14856 DAG.getConstant(Shift, DL, MVT::i32));
14857 else
14858 Mov =
14859 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14860 DAG.getConstant(Shift, DL, MVT::i32));
14861
14862 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14863 }
14864 }
14865
14866 return SDValue();
14867}
14868
14869// Try 16-bit splatted SIMD immediate.
14870static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14871 const APInt &Bits,
14872 const SDValue *LHS = nullptr) {
14873 EVT VT = Op.getValueType();
14874 if (VT.isFixedLengthVector() &&
14875 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14876 return SDValue();
14877
14878 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14879 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14880 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14881 bool isAdvSIMDModImm = false;
14882 uint64_t Shift;
14883
14884 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14885 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14886 Shift = 0;
14887 }
14888 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14889 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14890 Shift = 8;
14891 }
14892
14893 if (isAdvSIMDModImm) {
14894 SDLoc DL(Op);
14895 SDValue Mov;
14896
14897 if (LHS)
14898 Mov = DAG.getNode(NewOp, DL, MovTy,
14899 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14900 DAG.getConstant(Value, DL, MVT::i32),
14901 DAG.getConstant(Shift, DL, MVT::i32));
14902 else
14903 Mov =
14904 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14905 DAG.getConstant(Shift, DL, MVT::i32));
14906
14907 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14908 }
14909 }
14910
14911 return SDValue();
14912}
14913
14914// Try 32-bit splatted SIMD immediate with shifted ones.
14915static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14916 SelectionDAG &DAG, const APInt &Bits) {
14917 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14918 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14919 EVT VT = Op.getValueType();
14920 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14921 bool isAdvSIMDModImm = false;
14922 uint64_t Shift;
14923
14924 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14925 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14926 Shift = 264;
14927 }
14928 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14929 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14930 Shift = 272;
14931 }
14932
14933 if (isAdvSIMDModImm) {
14934 SDLoc DL(Op);
14935 SDValue Mov =
14936 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14937 DAG.getConstant(Shift, DL, MVT::i32));
14938 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14939 }
14940 }
14941
14942 return SDValue();
14943}
14944
14945// Try 8-bit splatted SIMD immediate.
14946static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14947 const APInt &Bits) {
14948 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14949 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14950 EVT VT = Op.getValueType();
14951 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14952
14953 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14954 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14955
14956 SDLoc DL(Op);
14957 SDValue Mov =
14958 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14959 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14960 }
14961 }
14962
14963 return SDValue();
14964}
14965
14966// Try FP splatted SIMD immediate.
14967static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14968 const APInt &Bits) {
14969 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14970 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14971 EVT VT = Op.getValueType();
14972 bool isWide = (VT.getSizeInBits() == 128);
14973 MVT MovTy;
14974 bool isAdvSIMDModImm = false;
14975
14976 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14977 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14978 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14979 }
14980 else if (isWide &&
14981 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14982 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14983 MovTy = MVT::v2f64;
14984 }
14985
14986 if (isAdvSIMDModImm) {
14987 SDLoc DL(Op);
14988 SDValue Mov =
14989 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14990 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14991 }
14992 }
14993
14994 return SDValue();
14995}
14996
14997// Specialized code to quickly check whether PotentialBVec is a BuildVector
14998// consisting of only the same constant int value; that value is returned in
14999// the reference argument ConstVal.
15000static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15001 uint64_t &ConstVal) {
15002 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15003 if (!Bvec)
15004 return false;
15005 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15006 if (!FirstElt)
15007 return false;
15008 EVT VT = Bvec->getValueType(0);
15009 unsigned NumElts = VT.getVectorNumElements();
15010 for (unsigned i = 1; i < NumElts; ++i)
15011 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15012 return false;
15013 ConstVal = FirstElt->getZExtValue();
15014 return true;
15015}
15016
15017static bool isAllInactivePredicate(SDValue N) {
15018 // Look through cast.
15019 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15020 N = N.getOperand(0);
15021
15022 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15023}
15024
15025static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15026 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15027
15028 // Look through cast.
15029 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15030 N = N.getOperand(0);
15031 // When reinterpreting from a type with fewer elements the "new" elements
15032 // are not active, so bail if they're likely to be used.
15033 if (N.getValueType().getVectorMinNumElements() < NumElts)
15034 return false;
15035 }
15036
15037 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15038 return true;
15039
15040 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15041 // or smaller than the implicit element type represented by N.
15042 // NOTE: A larger element count implies a smaller element type.
15043 if (N.getOpcode() == AArch64ISD::PTRUE &&
15044 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15045 return N.getValueType().getVectorMinNumElements() >= NumElts;
15046
15047 // If we're compiling for a specific vector-length, we can check if the
15048 // pattern's VL equals that of the scalable vector at runtime.
15049 if (N.getOpcode() == AArch64ISD::PTRUE) {
15050 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15051 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
15052 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
15053 if (MaxSVESize && MinSVESize == MaxSVESize) {
15054 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
15055 unsigned PatNumElts =
15056 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
15057 return PatNumElts == (NumElts * VScale);
15058 }
15059 }
15060
15061 return false;
15062}
15063
15064// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15065// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15066// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15067// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15068// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15069// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
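// For example, with v4i32 lanes, (or (and X, splat(0x00FFFFFF)), (VSHL Y, 24))
// satisfies the SLI condition since 0x00FFFFFF == ~(-1 << 24), and is rewritten
// to (VSLI X, Y, 24), which shifts Y left by 24 and inserts it into X.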
15070static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15071 EVT VT = N->getValueType(0);
15072
15073 if (!VT.isVector())
15074 return SDValue();
15075
15076 SDLoc DL(N);
15077
15078 SDValue And;
15079 SDValue Shift;
15080
15081 SDValue FirstOp = N->getOperand(0);
15082 unsigned FirstOpc = FirstOp.getOpcode();
15083 SDValue SecondOp = N->getOperand(1);
15084 unsigned SecondOpc = SecondOp.getOpcode();
15085
15086 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15087 // a BICi in order to use an immediate instead of a register.
15088 // Is the other operand a shl or lshr? This will have been turned into:
15089 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15090 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15091 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15092 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15093 SecondOpc == AArch64ISD::SHL_PRED ||
15094 SecondOpc == AArch64ISD::SRL_PRED)) {
15095 And = FirstOp;
15096 Shift = SecondOp;
15097
15098 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15099 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15100 FirstOpc == AArch64ISD::SHL_PRED ||
15101 FirstOpc == AArch64ISD::SRL_PRED)) {
15102 And = SecondOp;
15103 Shift = FirstOp;
15104 } else
15105 return SDValue();
15106
15107 bool IsAnd = And.getOpcode() == ISD::AND;
15108 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15109 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15110 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15111 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15112
15113 // Is the shift amount constant and are all lanes active?
15114 uint64_t C2;
15115 if (ShiftHasPredOp) {
15116 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15117 return SDValue();
15118 APInt C;
15119 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15120 return SDValue();
15121 C2 = C.getZExtValue();
15122 } else if (ConstantSDNode *C2node =
15123 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15124 C2 = C2node->getZExtValue();
15125 else
15126 return SDValue();
15127
15128 APInt C1AsAPInt;
15129 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15130 if (IsAnd) {
15131 // Is the and mask vector all constant?
15132 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15133 return SDValue();
15134 } else {
15135 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15136 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15137 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15138 assert(C1nodeImm && C1nodeShift);
15139 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15140 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15141 }
15142
15143 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15144 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15145 // how much one can shift elements of a particular size?
15146 if (C2 > ElemSizeInBits)
15147 return SDValue();
15148
15149 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15150 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15151 if (C1AsAPInt != RequiredC1)
15152 return SDValue();
15153
15154 SDValue X = And.getOperand(0);
15155 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15156 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15157 : Shift.getOperand(1);
15158
15159 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15160 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15161}
15162
15163static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15164 EVT VT = N->getValueType(0);
15165 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15166 SDLoc DL(N);
15167 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15168
15169 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15170 return SDValue();
15171
15172 SDValue N0 = N->getOperand(0);
15173 if (N0.getOpcode() != ISD::AND)
15174 return SDValue();
15175
15176 SDValue N1 = N->getOperand(1);
15177 if (N1.getOpcode() != ISD::AND)
15178 return SDValue();
15179
15180 // InstCombine does (not (neg a)) => (add a -1).
15181 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15182 // Loop over all combinations of AND operands.
15183 for (int i = 1; i >= 0; --i) {
15184 for (int j = 1; j >= 0; --j) {
15185 SDValue O0 = N0->getOperand(i);
15186 SDValue O1 = N1->getOperand(j);
15187 SDValue Sub, Add, SubSibling, AddSibling;
15188
15189 // Find a SUB and an ADD operand, one from each AND.
15190 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15191 Sub = O0;
15192 Add = O1;
15193 SubSibling = N0->getOperand(1 - i);
15194 AddSibling = N1->getOperand(1 - j);
15195 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15196 Add = O0;
15197 Sub = O1;
15198 AddSibling = N0->getOperand(1 - i);
15199 SubSibling = N1->getOperand(1 - j);
15200 } else
15201 continue;
15202
15203 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15204 continue;
15205
15206 // The all-ones constant is always the right-hand operand of the Add.
15207 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15208 continue;
15209
15210 if (Sub.getOperand(1) != Add.getOperand(0))
15211 continue;
15212
15213 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15214 }
15215 }
15216
15217 // (or (and a b) (and (not a) c)) => (bsl a b c)
15218 // We only have to look for constant vectors here since the general, variable
15219 // case can be handled in TableGen.
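// For example, with v8i8 operands, (or (and X, splat(0x0F)), (and Y, splat(0xF0)))
// becomes (BSP splat(0x0F), X, Y): the low nibbles are selected from X and the
// high nibbles from Y.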
15220 unsigned Bits = VT.getScalarSizeInBits();
15221 for (int i = 1; i >= 0; --i)
15222 for (int j = 1; j >= 0; --j) {
15223 APInt Val1, Val2;
15224
15225 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15226 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15227 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15228 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15229 N0->getOperand(1 - i), N1->getOperand(1 - j));
15230 }
15231 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15232 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15233 if (!BVN0 || !BVN1)
15234 continue;
15235
15236 bool FoundMatch = true;
15237 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15238 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15239 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15240 if (!CN0 || !CN1 ||
15241 CN0->getAPIntValue().trunc(Bits) !=
15242 ~CN1->getAsAPIntVal().trunc(Bits)) {
15243 FoundMatch = false;
15244 break;
15245 }
15246 }
15247 if (FoundMatch)
15248 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15249 N0->getOperand(1 - i), N1->getOperand(1 - j));
15250 }
15251
15252 return SDValue();
15253}
15254
15255SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15256 SelectionDAG &DAG) const {
15257 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15258 !Subtarget->isNeonAvailable()))
15259 return LowerToScalableOp(Op, DAG);
15260
15261 if (SDValue Res = tryLowerToBSL(Op, DAG))
15262 return Res;
15263
15264 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15265 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15266 return Res;
15267
15268 EVT VT = Op.getValueType();
15269 if (VT.isScalableVector())
15270 return Op;
15271
15272 SDValue LHS = Op.getOperand(0);
15273 BuildVectorSDNode *BVN =
15274 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15275 if (!BVN) {
15276 // OR commutes, so try swapping the operands.
15277 LHS = Op.getOperand(1);
15278 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15279 }
15280 if (!BVN)
15281 return Op;
15282
15283 APInt DefBits(VT.getSizeInBits(), 0);
15284 APInt UndefBits(VT.getSizeInBits(), 0);
15285 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15286 SDValue NewOp;
15287
15288 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15289 DefBits, &LHS)) ||
15290 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15291 DefBits, &LHS)))
15292 return NewOp;
15293
15294 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15295 UndefBits, &LHS)) ||
15296 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15297 UndefBits, &LHS)))
15298 return NewOp;
15299 }
15300
15301 // We can always fall back to a non-immediate OR.
15302 return Op;
15303}
15304
15305// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15306// be truncated to fit the element width.
15307static SDValue NormalizeBuildVector(SDValue Op,
15308 SelectionDAG &DAG) {
15309 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15310 SDLoc DL(Op);
15311 EVT VT = Op.getValueType();
15312 EVT EltTy = VT.getVectorElementType();
15313
15314 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15315 return Op;
15316
15317 SmallVector<SDValue, 16> Ops;
15318 for (SDValue Lane : Op->ops()) {
15319 // For integer vectors, type legalization would have promoted the
15320 // operands already. Otherwise, if Op is a floating-point splat
15321 // (with operands cast to integers), then the only possibilities
15322 // are constants and UNDEFs.
15323 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15324 Lane = DAG.getConstant(
15325 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15326 DL, MVT::i32);
15327 } else if (Lane.getNode()->isUndef()) {
15328 Lane = DAG.getUNDEF(MVT::i32);
15329 } else {
15330 assert(Lane.getValueType() == MVT::i32 &&
15331 "Unexpected BUILD_VECTOR operand type");
15332 }
15333 Ops.push_back(Lane);
15334 }
15335 return DAG.getBuildVector(VT, DL, Ops);
15336}
15337
15338static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15339 const AArch64Subtarget *ST, APInt &DefBits) {
15340 EVT VT = Op.getValueType();
15341 // TODO: We should be able to support 64-bit destinations too
15342 if (!ST->hasSVE() || !VT.is128BitVector() ||
15343 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15344 return SDValue();
15345
15346 // See if we can make use of the SVE dup instruction.
15347 APInt Val64 = DefBits.trunc(64);
15348 int32_t ImmVal, ShiftVal;
15349 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15350 return SDValue();
15351
15352 SDLoc DL(Op);
15353 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15354 DAG.getConstant(Val64, DL, MVT::i64));
15355 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15356 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15357}
15358
15359static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15360 const AArch64Subtarget *ST) {
15361 EVT VT = Op.getValueType();
15362 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15363 "Expected a legal NEON vector");
15364
15365 APInt DefBits(VT.getSizeInBits(), 0);
15366 APInt UndefBits(VT.getSizeInBits(), 0);
15367 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15368 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15369 auto TryMOVIWithBits = [&](APInt DefBits) {
15370 SDValue NewOp;
15371 if ((NewOp =
15372 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15373 (NewOp =
15374 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15375 (NewOp =
15376 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15377 (NewOp =
15378 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15379 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15380 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15381 return NewOp;
15382
15383 APInt NotDefBits = ~DefBits;
15384 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15385 NotDefBits)) ||
15386 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15387 NotDefBits)) ||
15388 (NewOp =
15389 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15390 return NewOp;
15391 return SDValue();
15392 };
15393 if (SDValue R = TryMOVIWithBits(DefBits))
15394 return R;
15395 if (SDValue R = TryMOVIWithBits(UndefBits))
15396 return R;
15397
15398 // Try to materialise the constant using SVE when available.
15399 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15400 return R;
15401
15402 // See if a fneg of the constant can be materialized with a MOVI, etc
15403 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15404 // FNegate each sub-element of the constant
15405 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15406 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15407 .zext(VT.getSizeInBits());
15408 APInt NegBits(VT.getSizeInBits(), 0);
15409 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15410 for (unsigned i = 0; i < NumElts; i++)
15411 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15412 NegBits = DefBits ^ NegBits;
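// NegBits is DefBits with the sign bit of every FVT-sized lane flipped; if
// that flipped constant is cheap to materialize above, an FNEG of it
// reproduces the original bit pattern.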
15413
15414 // Try to create the new constants with MOVI, and if so generate a fneg
15415 // for it.
15416 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15417 SDLoc DL(Op);
15418 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15419 return DAG.getNode(
15420 AArch64ISD::NVCAST, DL, VT,
15421 DAG.getNode(ISD::FNEG, DL, VFVT,
15422 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15423 }
15424 return SDValue();
15425 };
15426 SDValue R;
15427 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15428 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15429 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15430 return R;
15431 }
15432
15433 return SDValue();
15434}
15435
15436SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15437 SDValue Op, SelectionDAG &DAG) const {
15438 EVT VT = Op.getValueType();
15439 SDLoc DL(Op);
15440 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15441 auto *BVN = cast<BuildVectorSDNode>(Op);
15442
15443 if (auto SeqInfo = BVN->isConstantSequence()) {
15444 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15445 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15446 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15447 return convertFromScalableVector(DAG, VT, Seq);
15448 }
15449
15450 unsigned NumElems = VT.getVectorNumElements();
15451 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15452 NumElems <= 1 || BVN->isConstant())
15453 return SDValue();
15454
15455 auto IsExtractElt = [](SDValue Op) {
15456 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15457 };
15458
15459 // For integer types that are not already in vectors, limit to at most four
15460 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15461 if (VT.getScalarType().isInteger() &&
15462 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15463 return SDValue();
15464
15465 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15466 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15467 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15468 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15469 return Op.isUndef() ? Undef
15470 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15471 ContainerVT, Undef, Op, ZeroI64);
15472 });
15473
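// Roughly: for a 4-element build vector <a,b,c,d>, the first round zips the
// single-element vectors into <a,b,...> and <c,d,...>; the next round
// reinterprets these as double-width elements and zips again, leaving
// <a,b,c,d> in the low lanes.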
15474 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15475 while (Intermediates.size() > 1) {
15476 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15477
15478 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15479 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15480 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15481 Intermediates[I / 2] =
15482 Op1.isUndef() ? Op0
15483 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15484 }
15485
15486 Intermediates.resize(Intermediates.size() / 2);
15487 ZipEC = ZipEC.divideCoefficientBy(2);
15488 }
15489
15490 assert(Intermediates.size() == 1);
15491 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15492 return convertFromScalableVector(DAG, VT, Vec);
15493}
15494
15495SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15496 SelectionDAG &DAG) const {
15497 EVT VT = Op.getValueType();
15498
15499 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15500 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15501 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15502 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15503
15504 // Try to build a simple constant vector.
15505 Op = NormalizeBuildVector(Op, DAG);
15506 // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15507 // abort.
15508 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15509 return SDValue();
15510
15511 // Certain vector constants, used to express things like logical NOT and
15512 // arithmetic NEG, are passed through unmodified. This allows special
15513 // patterns for these operations to match, which will lower these constants
15514 // to whatever is proven necessary.
15515 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15516 if (BVN->isConstant()) {
15517 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15518 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15519 APInt Val(BitSize,
15520 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15521 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15522 return Op;
15523 }
15524 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15525 if (Const->isZero() && !Const->isNegative())
15526 return Op;
15527 }
15528
15529 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15530 return V;
15531
15532 // Scan through the operands to find some interesting properties we can
15533 // exploit:
15534 // 1) If only one value is used, we can use a DUP, or
15535 // 2) if only the low element is not undef, we can just insert that, or
15536 // 3) if only one constant value is used (w/ some non-constant lanes),
15537 // we can splat the constant value into the whole vector then fill
15538 // in the non-constant lanes.
15539 // 4) FIXME: If different constant values are used, but we can intelligently
15540 // select the values we'll be overwriting for the non-constant
15541 // lanes such that we can directly materialize the vector
15542 // some other way (MOVI, e.g.), we can be sneaky.
15543 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15544 SDLoc DL(Op);
15545 unsigned NumElts = VT.getVectorNumElements();
15546 bool isOnlyLowElement = true;
15547 bool usesOnlyOneValue = true;
15548 bool usesOnlyOneConstantValue = true;
15549 bool isConstant = true;
15550 bool AllLanesExtractElt = true;
15551 unsigned NumConstantLanes = 0;
15552 unsigned NumDifferentLanes = 0;
15553 unsigned NumUndefLanes = 0;
15554 SDValue Value;
15555 SDValue ConstantValue;
15556 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15557 unsigned ConsecutiveValCount = 0;
15558 SDValue PrevVal;
15559 for (unsigned i = 0; i < NumElts; ++i) {
15560 SDValue V = Op.getOperand(i);
15561 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15562 AllLanesExtractElt = false;
15563 if (V.isUndef()) {
15564 ++NumUndefLanes;
15565 continue;
15566 }
15567 if (i > 0)
15568 isOnlyLowElement = false;
15569 if (!isIntOrFPConstant(V))
15570 isConstant = false;
15571
15572 if (isIntOrFPConstant(V)) {
15573 ++NumConstantLanes;
15574 if (!ConstantValue.getNode())
15575 ConstantValue = V;
15576 else if (ConstantValue != V)
15577 usesOnlyOneConstantValue = false;
15578 }
15579
15580 if (!Value.getNode())
15581 Value = V;
15582 else if (V != Value) {
15583 usesOnlyOneValue = false;
15584 ++NumDifferentLanes;
15585 }
15586
15587 if (PrevVal != V) {
15588 ConsecutiveValCount = 0;
15589 PrevVal = V;
15590 }
15591
15592 // Keep different values and its last consecutive count. For example,
15593 //
15594 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15595 // t24, t24, t24, t24, t24, t24, t24, t24
15596 // t23 = consecutive count 8
15597 // t24 = consecutive count 8
15598 // ------------------------------------------------------------------
15599 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15600 // t24, t24, t24, t24, t24, t24, t24, t24
15601 // t23 = consecutive count 5
15602 // t24 = consecutive count 9
15603 DifferentValueMap[V] = ++ConsecutiveValCount;
15604 }
15605
15606 if (!Value.getNode()) {
15607 LLVM_DEBUG(
15608 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15609 return DAG.getUNDEF(VT);
15610 }
15611
15612 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15613 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15614 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15615 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15616 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15617 "SCALAR_TO_VECTOR node\n");
15618 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15619 }
15620
15621 if (AllLanesExtractElt) {
15622 SDNode *Vector = nullptr;
15623 bool Even = false;
15624 bool Odd = false;
15625 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15626 // the Odd pattern <1,3,5,...>.
15627 for (unsigned i = 0; i < NumElts; ++i) {
15628 SDValue V = Op.getOperand(i);
15629 const SDNode *N = V.getNode();
15630 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15631 Even = false;
15632 Odd = false;
15633 break;
15634 }
15635 SDValue N0 = N->getOperand(0);
15636
15637 // All elements are extracted from the same vector.
15638 if (!Vector) {
15639 Vector = N0.getNode();
15640 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15641 // BUILD_VECTOR.
15642 if (VT.getVectorElementType() !=
15643 N0.getValueType().getVectorElementType())
15644 break;
15645 } else if (Vector != N0.getNode()) {
15646 Odd = false;
15647 Even = false;
15648 break;
15649 }
15650
15651 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15652 // indices <1,3,5,...>.
15653 uint64_t Val = N->getConstantOperandVal(1);
15654 if (Val == 2 * i) {
15655 Even = true;
15656 continue;
15657 }
15658 if (Val - 1 == 2 * i) {
15659 Odd = true;
15660 continue;
15661 }
15662
15663 // Something does not match: abort.
15664 Odd = false;
15665 Even = false;
15666 break;
15667 }
15668 if (Even || Odd) {
15669 SDValue LHS =
15670 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15671 DAG.getConstant(0, DL, MVT::i64));
15672 SDValue RHS =
15673 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15674 DAG.getConstant(NumElts, DL, MVT::i64));
15675
15676 if (Even && !Odd)
15677 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15678 if (Odd && !Even)
15679 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15680 }
15681 }
15682
15683 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15684 // i32 and try again.
15685 if (usesOnlyOneValue) {
15686 if (!isConstant) {
15687 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15688 Value.getValueType() != VT) {
15689 LLVM_DEBUG(
15690 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15691 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15692 }
15693
15694 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
15695
15696 SDValue Lane = Value.getOperand(1);
15697 Value = Value.getOperand(0);
15698 if (Value.getValueSizeInBits() == 64) {
15699 LLVM_DEBUG(
15700 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15701 "widening it\n");
15702 Value = WidenVector(Value, DAG);
15703 }
15704
15705 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15706 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15707 }
15708
15711 EVT EltTy = VT.getVectorElementType();
15712 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15713 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15714 LLVM_DEBUG(
15715 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15716 "BITCASTS, and try again\n");
15717 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15718 for (unsigned i = 0; i < NumElts; ++i)
15719 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15720 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15721 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15722 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15723 Val.dump(););
15724 Val = LowerBUILD_VECTOR(Val, DAG);
15725 if (Val.getNode())
15726 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15727 }
15728 }
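// For illustration, an assumed f32 example: a v4f32 splat of 1.0 is retried
// as a v4i32 splat of 0x3f800000 so the integer BUILD_VECTOR paths (constant
// materialization, DUP, ...) get a chance to handle it, and any result is
// bitcast back to v4f32.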
15729
15730 // If we need to insert a small number of different non-constant elements and
15731 // the vector width is sufficiently large, prefer using DUP with the common
15732 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15733 // skip the constant lane handling below.
15734 bool PreferDUPAndInsert =
15735 !isConstant && NumDifferentLanes >= 1 &&
15736 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15737 NumDifferentLanes >= NumConstantLanes;
15738
15739 // If there was only one constant value used, and it appeared in more than one lane,
15740 // start by splatting that value, then replace the non-constant lanes. This
15741 // is better than the default, which will perform a separate initialization
15742 // for each lane.
15743 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15744 // Firstly, try to materialize the splat constant.
15745 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15746 unsigned BitSize = VT.getScalarSizeInBits();
15747 APInt ConstantValueAPInt(1, 0);
15748 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15749 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15750 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15751 !ConstantValueAPInt.isAllOnes()) {
15752 Val = ConstantBuildVector(Val, DAG, Subtarget);
15753 if (!Val)
15754 // Otherwise, materialize the constant and splat it.
15755 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15756 }
15757
15758 // Now insert the non-constant lanes.
15759 for (unsigned i = 0; i < NumElts; ++i) {
15760 SDValue V = Op.getOperand(i);
15761 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15762 if (!isIntOrFPConstant(V) && !V.isUndef())
15763 // Note that type legalization likely mucked about with the VT of the
15764 // source operand, so we may have to convert it here before inserting.
15765 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15766 }
15767 return Val;
15768 }
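// For illustration, an assumed example with one constant value c and a single
// non-constant lane:
//   t30: v4i32 = BUILD_VECTOR c, c, t5, c
// first splats c, then emits a single INSERT_VECTOR_ELT of t5 into lane 2,
// instead of initializing all four lanes individually.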
15769
15770 // This will generate a load from the constant pool.
15771 if (isConstant) {
15772 LLVM_DEBUG(
15773 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15774 "expansion\n");
15775 return SDValue();
15776 }
15777
15778 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15779 // v4i32s. This is really a truncate, which we can construct out of (legal)
15780 // concats and truncate nodes.
15782 return M;
15783
15784 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15785 if (NumElts >= 4) {
15786 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15787 return Shuffle;
15788
15789 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15790 return Shuffle;
15791 }
15792
15793 if (PreferDUPAndInsert) {
15794 // First, build a constant vector with the common element.
15796 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15797 // Next, insert the elements that do not match the common value.
15798 for (unsigned I = 0; I < NumElts; ++I)
15799 if (Op.getOperand(I) != Value)
15800 NewVector =
15801 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15802 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15803
15804 return NewVector;
15805 }
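// For illustration, an assumed v8i16 example where x and y are non-constant
// scalars and only one lane differs:
//   t40: v8i16 = BUILD_VECTOR x, x, x, x, x, x, y, x
// takes this path: a DUP of x followed by one INSERT_VECTOR_ELT of y, rather
// than eight separate lane insertions.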
15806
15807 // If vector consists of two different values, try to generate two DUPs and
15808 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15809 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15811 // Check whether each value's consecutive run count is half the number of
15812 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
15813 //
15814 // canUseVECTOR_CONCAT = true;
15815 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15816 // t24, t24, t24, t24, t24, t24, t24, t24
15817 //
15818 // canUseVECTOR_CONCAT = false;
15819 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15820 // t24, t24, t24, t24, t24, t24, t24, t24
15821 bool canUseVECTOR_CONCAT = true;
15822 for (auto Pair : DifferentValueMap) {
15823 // Check that each distinct value has a run length of exactly NumElts / 2.
15824 if (Pair.second != NumElts / 2)
15825 canUseVECTOR_CONCAT = false;
15826 Vals.push_back(Pair.first);
15827 }
15828
15829 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15830 // CONCAT_VECTORs. For example,
15831 //
15832 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15833 // t24, t24, t24, t24, t24, t24, t24, t24
15834 // ==>
15835 // t26: v8i8 = AArch64ISD::DUP t23
15836 // t28: v8i8 = AArch64ISD::DUP t24
15837 // t29: v16i8 = concat_vectors t26, t28
15838 if (canUseVECTOR_CONCAT) {
15839 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15840 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15841 SubVT.getVectorNumElements() >= 2) {
15842 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15843 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15844 SDValue DUP1 =
15845 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15846 SDValue DUP2 =
15847 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15849 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15850 return CONCAT_VECTORS;
15851 }
15852 }
15853
15854 // Let's try to generate VECTOR_SHUFFLE. For example,
15855 //
15856 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15857 // ==>
15858 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15859 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15860 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15861 if (NumElts >= 8) {
15862 SmallVector<int, 16> MaskVec;
15863 // Build the mask for the VECTOR_SHUFFLE.
15864 SDValue FirstLaneVal = Op.getOperand(0);
15865 for (unsigned i = 0; i < NumElts; ++i) {
15866 SDValue Val = Op.getOperand(i);
15867 if (FirstLaneVal == Val)
15868 MaskVec.push_back(i);
15869 else
15870 MaskVec.push_back(i + NumElts);
15871 }
15872
15873 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15874 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15875 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15876 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15878 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15879 return VECTOR_SHUFFLE;
15880 }
15881 }
15882
15883 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15884 // know the default expansion would otherwise fall back on something even
15885 // worse. For a vector with one or two non-undef values, that's
15886 // scalar_to_vector for the elements followed by a shuffle (provided the
15887 // shuffle is valid for the target); for everything else, it's element-by-
15888 // element materialization on the stack followed by a load.
15889 if (!isConstant && !usesOnlyOneValue) {
15890 LLVM_DEBUG(
15891 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15892 "of INSERT_VECTOR_ELT\n");
15893
15894 SDValue Vec = DAG.getUNDEF(VT);
15895 SDValue Op0 = Op.getOperand(0);
15896 unsigned i = 0;
15897
15898 // Use SCALAR_TO_VECTOR for lane zero to
15899 // a) Avoid a RMW dependency on the full vector register, and
15900 // b) Allow the register coalescer to fold away the copy if the
15901 // value is already in an S or D register, and we're forced to emit an
15902 // INSERT_SUBREG that we can't fold anywhere.
15903 //
15904 // We also allow types like i8 and i16 which are illegal scalar but legal
15905 // vector element types. After type-legalization the inserted value is
15906 // extended (i32) and it is safe to cast them to the vector type by ignoring
15907 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15908 if (!Op0.isUndef()) {
15909 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15910 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15911 ++i;
15912 }
15913 LLVM_DEBUG({
15914 if (i < NumElts)
15915 dbgs() << "Creating nodes for the other vector elements:\n";
15916 });
15917 for (; i < NumElts; ++i) {
15918 SDValue V = Op.getOperand(i);
15919 if (V.isUndef())
15920 continue;
15921 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15922 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15923 }
15924 return Vec;
15925 }
15926
15927 LLVM_DEBUG(
15928 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15929 "better alternative\n");
15930 return SDValue();
15931}
15932
15933SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15934 SelectionDAG &DAG) const {
15935 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15936 !Subtarget->isNeonAvailable()))
15937 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15938
15939 assert(Op.getValueType().isScalableVector() &&
15940 isTypeLegal(Op.getValueType()) &&
15941 "Expected legal scalable vector type!");
15942
15943 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15944 unsigned NumOperands = Op->getNumOperands();
15945 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15946 "Unexpected number of operands in CONCAT_VECTORS");
15947
15948 if (NumOperands == 2)
15949 return Op;
15950
15951 // Concat each pair of subvectors and pack into the lower half of the array.
15952 SmallVector<SDValue> ConcatOps(Op->ops());
15953 while (ConcatOps.size() > 1) {
15954 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15955 SDValue V1 = ConcatOps[I];
15956 SDValue V2 = ConcatOps[I + 1];
15957 EVT SubVT = V1.getValueType();
15958 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15959 ConcatOps[I / 2] =
15960 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15961 }
15962 ConcatOps.resize(ConcatOps.size() / 2);
15963 }
15964 return ConcatOps[0];
15965 }
15966
15967 return SDValue();
15968}
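// For illustration, an assumed four-operand case:
//   concat_vectors a, b, c, d
// is built pairwise by the loop above as
//   concat_vectors (concat_vectors a, b), (concat_vectors c, d)
// halving the number of operands on every iteration.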
15969
15970SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15971 SelectionDAG &DAG) const {
15972 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15973
15974 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15975 !Subtarget->isNeonAvailable()))
15976 return LowerFixedLengthInsertVectorElt(Op, DAG);
15977
15978 EVT VT = Op.getOperand(0).getValueType();
15979
15980 if (VT.getScalarType() == MVT::i1) {
15981 EVT VectorVT = getPromotedVTForPredicate(VT);
15982 SDLoc DL(Op);
15983 SDValue ExtendedVector =
15984 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15985 SDValue ExtendedValue =
15986 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15987 VectorVT.getScalarType().getSizeInBits() < 32
15988 ? MVT::i32
15989 : VectorVT.getScalarType());
15990 ExtendedVector =
15991 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15992 ExtendedValue, Op.getOperand(2));
15993 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15994 }
15995
15996 // Check for non-constant or out of range lane.
15997 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15998 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15999 return SDValue();
16000
16001 return Op;
16002}
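// For illustration, an assumed SVE predicate case: inserting an i1 into an
// nxv4i1 vector widens the predicate to nxv4i32, performs the insert with the
// (extended) scalar there, and truncates the result back to nxv4i1.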
16003
16004SDValue
16005AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16006 SelectionDAG &DAG) const {
16007 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16008 EVT VT = Op.getOperand(0).getValueType();
16009
16010 if (VT.getScalarType() == MVT::i1) {
16011 // We can't directly extract from an SVE predicate; extend it first.
16012 // (This isn't the only possible lowering, but it's straightforward.)
16013 EVT VectorVT = getPromotedVTForPredicate(VT);
16014 SDLoc DL(Op);
16015 SDValue Extend =
16016 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16017 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16018 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16019 Extend, Op.getOperand(1));
16020 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16021 }
16022
16023 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16024 return LowerFixedLengthExtractVectorElt(Op, DAG);
16025
16026 // Check for non-constant or out of range lane.
16027 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16028 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16029 return SDValue();
16030
16031 // Insertion/extraction are legal for V128 types.
16032 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16033 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16034 VT == MVT::v8f16 || VT == MVT::v8bf16)
16035 return Op;
16036
16037 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16038 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16039 VT != MVT::v4bf16)
16040 return SDValue();
16041
16042 // For V64 types, we perform extraction by widening the value
16043 // to a V128 type and performing the extraction on that.
16044 SDLoc DL(Op);
16045 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16046 EVT WideTy = WideVec.getValueType();
16047
16048 EVT ExtrTy = WideTy.getVectorElementType();
16049 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16050 ExtrTy = MVT::i32;
16051
16052 // For extractions, we just return the result directly.
16053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16054 Op.getOperand(1));
16055}
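// For illustration, an assumed v4i16 case: extracting lane 3 first widens the
// source to v8i16 (its V128 container) and extracts from that, returning an
// i32 because i16 is not a legal scalar type here.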
16056
16057SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16058 SelectionDAG &DAG) const {
16059 EVT VT = Op.getValueType();
16061 "Only cases that extract a fixed length vector are supported!");
16062 EVT InVT = Op.getOperand(0).getValueType();
16063
16064 // If we don't have legal types yet, do nothing
16065 if (!isTypeLegal(InVT))
16066 return SDValue();
16067
16068 if (InVT.is128BitVector()) {
16069 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16070 unsigned Idx = Op.getConstantOperandVal(1);
16071
16072 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16073 if (Idx == 0)
16074 return Op;
16075
16076 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16077 // that directly.
16078 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16079 return Op;
16080 }
16081
16082 if (InVT.isScalableVector() ||
16083 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16084 SDLoc DL(Op);
16085 SDValue Vec = Op.getOperand(0);
16086 SDValue Idx = Op.getOperand(1);
16087
16088 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16089 if (PackedVT != InVT) {
16090 // Pack input into the bottom part of an SVE register and try again.
16091 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16092 DAG.getUNDEF(PackedVT), Vec,
16093 DAG.getVectorIdxConstant(0, DL));
16094 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16095 }
16096
16097 // This will get matched by custom code during ISelDAGToDAG.
16098 if (isNullConstant(Idx))
16099 return Op;
16100
16101 assert(InVT.isScalableVector() && "Unexpected vector type!");
16102 // Move requested subvector to the start of the vector and try again.
16103 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16104 return convertFromScalableVector(DAG, VT, Splice);
16105 }
16106
16107 return SDValue();
16108}
16109
16110SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16111 SelectionDAG &DAG) const {
16112 assert(Op.getValueType().isScalableVector() &&
16113 "Only expect to lower inserts into scalable vectors!");
16114
16115 EVT InVT = Op.getOperand(1).getValueType();
16116 unsigned Idx = Op.getConstantOperandVal(2);
16117
16118 SDValue Vec0 = Op.getOperand(0);
16119 SDValue Vec1 = Op.getOperand(1);
16120 SDLoc DL(Op);
16121 EVT VT = Op.getValueType();
16122
16123 if (InVT.isScalableVector()) {
16124 if (!isTypeLegal(VT))
16125 return SDValue();
16126
16127 // Break down insert_subvector into simpler parts.
16128 if (VT.getVectorElementType() == MVT::i1) {
16129 unsigned NumElts = VT.getVectorMinNumElements();
16130 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16131
16132 SDValue Lo, Hi;
16133 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16134 DAG.getVectorIdxConstant(0, DL));
16135 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16136 DAG.getVectorIdxConstant(NumElts / 2, DL));
16137 if (Idx < (NumElts / 2))
16138 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16139 DAG.getVectorIdxConstant(Idx, DL));
16140 else
16141 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16142 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16143
16144 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16145 }
16146
16147 // We can select these directly.
16148 if (isTypeLegal(InVT) && Vec0.isUndef())
16149 return Op;
16150
16151 // Ensure the subvector is half the size of the main vector.
16152 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16153 return SDValue();
16154
16155 // Here narrow and wide refer to the vector element types. After "casting",
16156 // both vectors must have the same bit length, and because the subvector
16157 // has fewer elements, those elements need to be bigger.
16158 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16159 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16160
16161 // NOP cast operands to the largest legal vector of the same element count.
16162 if (VT.isFloatingPoint()) {
16163 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16164 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16165 } else {
16166 // Legal integer vectors are already their largest so Vec0 is fine as is.
16167 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16168 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16169 }
16170
16171 // To replace the top/bottom half of vector V with vector SubV we widen the
16172 // preserved half of V, concatenate this to SubV (the order depending on the
16173 // half being replaced) and then narrow the result.
16174 SDValue Narrow;
16175 if (Idx == 0) {
16176 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16177 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16178 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16179 } else {
16180 assert(Idx == InVT.getVectorMinNumElements() &&
16181 "Invalid subvector index!");
16182 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16183 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16184 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16185 }
16186
16187 return getSVESafeBitCast(VT, Narrow, DAG);
16188 }
16189
16190 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16191 // This will be matched by custom code during ISelDAGToDAG.
16192 if (Vec0.isUndef())
16193 return Op;
16194
16195 std::optional<unsigned> PredPattern =
16197 auto PredTy = VT.changeVectorElementType(MVT::i1);
16198 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16199 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16200 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16201 }
16202
16203 return SDValue();
16204}
16205
16206static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16207 if (Op.getOpcode() != AArch64ISD::DUP &&
16208 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16209 Op.getOpcode() != ISD::BUILD_VECTOR)
16210 return false;
16211
16212 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16213 !isAllConstantBuildVector(Op, SplatVal))
16214 return false;
16215
16216 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16217 !isa<ConstantSDNode>(Op->getOperand(0)))
16218 return false;
16219
16220 SplatVal = Op->getConstantOperandVal(0);
16221 if (Op.getValueType().getVectorElementType() != MVT::i64)
16222 SplatVal = (int32_t)SplatVal;
16223
16224 Negated = false;
16225 if (isPowerOf2_64(SplatVal))
16226 return true;
16227
16228 Negated = true;
16229 if (isPowerOf2_64(-SplatVal)) {
16230 SplatVal = -SplatVal;
16231 return true;
16232 }
16233
16234 return false;
16235}
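// For illustration, an assumed example: a splat of -8 reports SplatVal = 8 and
// Negated = true, so the SDIV lowering below can use an arithmetic shift by 3
// followed by a negation of the result.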
16236
16237SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16238 EVT VT = Op.getValueType();
16239 SDLoc DL(Op);
16240
16241 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16242 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16243
16244 assert(VT.isScalableVector() && "Expected a scalable vector.");
16245
16246 bool Signed = Op.getOpcode() == ISD::SDIV;
16247 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16248
16249 bool Negated;
16250 uint64_t SplatVal;
16251 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
16253 SDValue Res =
16254 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16255 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16256 if (Negated)
16257 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16258
16259 return Res;
16260 }
16261
16262 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16263 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16264
16265 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16266 // operations, and truncate the result.
16267 EVT WidenedVT;
16268 if (VT == MVT::nxv16i8)
16269 WidenedVT = MVT::nxv8i16;
16270 else if (VT == MVT::nxv8i16)
16271 WidenedVT = MVT::nxv4i32;
16272 else
16273 llvm_unreachable("Unexpected Custom DIV operation");
16274
16275 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16276 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16277 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16278 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16279 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16280 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16281 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16282 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16283 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16284 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16285 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16286}
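// For illustration, an assumed nxv16i8 case: the sdiv is unpacked into low and
// high nxv8i16 halves, each of which is widened again (recursively) to nxv4i32
// where SDIV_PRED is legal, and the narrowed results are recombined with UZP1.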
16287
16288bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16289 EVT VT, unsigned DefinedValues) const {
16290 if (!Subtarget->isNeonAvailable())
16291 return false;
16293}
16294
16296 // Currently no fixed length shuffles that require SVE are legal.
16297 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16298 return false;
16299
16300 if (VT.getVectorNumElements() == 4 &&
16301 (VT.is128BitVector() || VT.is64BitVector())) {
16302 unsigned Cost = getPerfectShuffleCost(M);
16303 if (Cost <= 1)
16304 return true;
16305 }
16306
16307 bool DummyBool;
16308 int DummyInt;
16309 unsigned DummyUnsigned;
16310
16311 unsigned EltSize = VT.getScalarSizeInBits();
16312 unsigned NumElts = VT.getVectorNumElements();
16314 isREVMask(M, EltSize, NumElts, 64) ||
16315 isREVMask(M, EltSize, NumElts, 32) ||
16316 isREVMask(M, EltSize, NumElts, 16) ||
16317 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16318 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16319 isTRNMask(M, NumElts, DummyUnsigned) ||
16320 isUZPMask(M, NumElts, DummyUnsigned) ||
16321 isZIPMask(M, NumElts, DummyUnsigned) ||
16322 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16323 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16324 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16325 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16326 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16327}
16328
16330 EVT VT) const {
16331 // Just delegate to the generic legality, clear masks aren't special.
16332 return isShuffleMaskLegal(M, VT);
16333}
16334
16335/// getVShiftImm - Check if this is a valid build_vector for the immediate
16336/// operand of a vector shift operation, where all the elements of the
16337/// build_vector must have the same constant integer value.
16338static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16339 // Ignore bit_converts.
16340 while (Op.getOpcode() == ISD::BITCAST)
16341 Op = Op.getOperand(0);
16343 APInt SplatBits, SplatUndef;
16344 unsigned SplatBitSize;
16345 bool HasAnyUndefs;
16346 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16347 HasAnyUndefs, ElementBits) ||
16348 SplatBitSize > ElementBits)
16349 return false;
16350 Cnt = SplatBits.getSExtValue();
16351 return true;
16352}
16353
16354/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16355/// operand of a vector shift left operation. That value must be in the range:
16356/// 0 <= Value < ElementBits for a left shift; or
16357/// 0 <= Value <= ElementBits for a long left shift.
16358static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16359 assert(VT.isVector() && "vector shift count is not a vector type");
16360 int64_t ElementBits = VT.getScalarSizeInBits();
16361 if (!getVShiftImm(Op, ElementBits, Cnt))
16362 return false;
16363 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16364}
16365
16366/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16367/// operand of a vector shift right operation. The value must be in the range:
16368 /// 1 <= Value <= ElementBits for a right shift, or ElementBits/2 for a narrowing right shift.
16369static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16370 assert(VT.isVector() && "vector shift count is not a vector type");
16371 int64_t ElementBits = VT.getScalarSizeInBits();
16372 if (!getVShiftImm(Op, ElementBits, Cnt))
16373 return false;
16374 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16375}
16376
16377SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16378 SelectionDAG &DAG) const {
16379 EVT VT = Op.getValueType();
16380
16381 if (VT.getScalarType() == MVT::i1) {
16382 // Lower i1 truncate to `(x & 1) != 0`.
16383 SDLoc DL(Op);
16384 EVT OpVT = Op.getOperand(0).getValueType();
16385 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16386 SDValue One = DAG.getConstant(1, DL, OpVT);
16387 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16388 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16389 }
16390
16391 if (!VT.isVector() || VT.isScalableVector())
16392 return SDValue();
16393
16394 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16395 !Subtarget->isNeonAvailable()))
16396 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16397
16398 return SDValue();
16399}
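// For illustration, an assumed scalar case: truncating i32 %x to i1 becomes
//   setcc (and %x, 1), 0, setne
// as described by the comment above.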
16400
16401 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16402 // possibly a truncated type; it tells how many bits of the value are to be
16403 // used.
16405 SelectionDAG &DAG,
16406 unsigned &ShiftValue,
16407 SDValue &RShOperand) {
16408 if (Shift->getOpcode() != ISD::SRL)
16409 return false;
16410
16411 EVT VT = Shift.getValueType();
16412 assert(VT.isScalableVT());
16413
16414 auto ShiftOp1 =
16416 if (!ShiftOp1)
16417 return false;
16418
16419 ShiftValue = ShiftOp1->getZExtValue();
16420 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16421 return false;
16422
16423 SDValue Add = Shift->getOperand(0);
16424 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16425 return false;
16426
16428 "ResVT must be truncated or same type as the shift.");
16429 // Check if an overflow can lead to incorrect results.
16430 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16431 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16432 return false;
16433
16434 auto AddOp1 =
16436 if (!AddOp1)
16437 return false;
16438 uint64_t AddValue = AddOp1->getZExtValue();
16439 if (AddValue != 1ULL << (ShiftValue - 1))
16440 return false;
16441
16442 RShOperand = Add->getOperand(0);
16443 return true;
16444}
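// For illustration, an assumed case with ShiftValue == 4: the pattern
//   srl (add x, 8), 4
// adds 2^(4-1) before shifting, which is exactly a rounding right shift by 4,
// provided the add cannot overflow the bits that survive any truncation.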
16445
16446SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16447 SelectionDAG &DAG) const {
16448 EVT VT = Op.getValueType();
16449 SDLoc DL(Op);
16450 int64_t Cnt;
16451
16452 if (!Op.getOperand(1).getValueType().isVector())
16453 return Op;
16454 unsigned EltSize = VT.getScalarSizeInBits();
16455
16456 switch (Op.getOpcode()) {
16457 case ISD::SHL:
16458 if (VT.isScalableVector() ||
16459 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16460 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16461
16462 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16463 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16464 DAG.getConstant(Cnt, DL, MVT::i32));
16465 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16466 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16467 MVT::i32),
16468 Op.getOperand(0), Op.getOperand(1));
16469 case ISD::SRA:
16470 case ISD::SRL:
16471 if (VT.isScalableVector() &&
16472 (Subtarget->hasSVE2() ||
16473 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16474 SDValue RShOperand;
16475 unsigned ShiftValue;
16476 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16477 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16478 getPredicateForVector(DAG, DL, VT), RShOperand,
16479 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16480 }
16481
16482 if (VT.isScalableVector() ||
16483 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16484 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16485 : AArch64ISD::SRL_PRED;
16486 return LowerToPredicatedOp(Op, DAG, Opc);
16487 }
16488
16489 // Right shift immediate
16490 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16491 unsigned Opc =
16492 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16493 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16494 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
16495 }
16496
16497 // Right shift by register. Note that there is no shift-right-by-register
16498 // instruction; the shift-left-by-register instruction takes a signed
16499 // value, where negative amounts specify a right shift.
16500 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16501 : Intrinsic::aarch64_neon_ushl;
16502 // negate the shift amount
16503 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16504 Op.getOperand(1));
16505 SDValue NegShiftLeft =
16507 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16508 NegShift);
16509 return NegShiftLeft;
16510 }
16511
16512 llvm_unreachable("unexpected shift opcode");
16513}
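// For illustration, an assumed v4i32 case with a non-constant shift amount:
// there is no NEON right-shift-by-register instruction, so the result is
// emitted roughly as
//   neg  v2.4s, v1.4s            // negate the shift amounts
//   ushl v0.4s, v0.4s, v2.4s     // left shift by a negative amount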
16514
16515SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16516 SelectionDAG &DAG) const {
16517 if (Op.getValueType().isScalableVector())
16518 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16519
16520 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16521 !Subtarget->isNeonAvailable()))
16522 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16523
16524 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16525 SDValue LHS = Op.getOperand(0);
16526 SDValue RHS = Op.getOperand(1);
16527 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16528 SDLoc DL(Op);
16529
16530 if (LHS.getValueType().getVectorElementType().isInteger())
16531 return Op;
16532
16533 assert(((!Subtarget->hasFullFP16() &&
16534 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16535 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16536 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16537 "Unexpected type!");
16538
16539 // Lower isnan(x) | isnan(never-nan) to x != x.
16540 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16541 if (CC == ISD::SETUO || CC == ISD::SETO) {
16542 bool OneNaN = false;
16543 if (LHS == RHS) {
16544 OneNaN = true;
16545 } else if (DAG.isKnownNeverNaN(RHS)) {
16546 OneNaN = true;
16547 RHS = LHS;
16548 } else if (DAG.isKnownNeverNaN(LHS)) {
16549 OneNaN = true;
16550 LHS = RHS;
16551 }
16552 if (OneNaN) {
16553 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16554 }
16555 }
16556
16557 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16558 // clean. Some of them require two branches to implement.
16559 AArch64CC::CondCode CC1, CC2;
16560 bool ShouldInvert;
16561 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16562
16563 bool NoNaNs =
16564 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16565 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16566 if (!Cmp.getNode())
16567 return SDValue();
16568
16569 if (CC2 != AArch64CC::AL) {
16570 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16571 if (!Cmp2.getNode())
16572 return SDValue();
16573
16574 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16575 }
16576
16577 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16578
16579 if (ShouldInvert)
16580 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16581
16582 return Cmp;
16583}
16584
16585static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16586 SelectionDAG &DAG) {
16587 SDValue VecOp = ScalarOp.getOperand(0);
16588 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16589 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16590 DAG.getConstant(0, DL, MVT::i64));
16591}
16592
16593static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16594 SDLoc DL, SelectionDAG &DAG) {
16595 unsigned ScalarOpcode;
16596 switch (Opcode) {
16597 case ISD::VECREDUCE_AND:
16598 ScalarOpcode = ISD::AND;
16599 break;
16600 case ISD::VECREDUCE_OR:
16601 ScalarOpcode = ISD::OR;
16602 break;
16603 case ISD::VECREDUCE_XOR:
16604 ScalarOpcode = ISD::XOR;
16605 break;
16606 default:
16607 llvm_unreachable("Expected bitwise vector reduction");
16608 return SDValue();
16609 }
16610
16611 EVT VecVT = Vec.getValueType();
16612 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16613 "Expected power-of-2 length vector");
16614
16615 EVT ElemVT = VecVT.getVectorElementType();
16616
16617 SDValue Result;
16618 unsigned NumElems = VecVT.getVectorNumElements();
16619
16620 // Special case for boolean reductions
16621 if (ElemVT == MVT::i1) {
16622 // Split large vectors into smaller ones
16623 if (NumElems > 16) {
16624 SDValue Lo, Hi;
16625 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16626 EVT HalfVT = Lo.getValueType();
16627 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16628 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16629 }
16630
16631 // Results of setcc operations get widened to 128 bits if their input
16632 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16633 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16634 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16635 // size leads to the best codegen, since e.g. setcc results might need to be
16636 // truncated otherwise.
16637 unsigned ExtendedWidth = 64;
16638 if (Vec.getOpcode() == ISD::SETCC &&
16639 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16640 ExtendedWidth = 128;
16641 }
16642 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16643
16644 // any_ext doesn't work with umin/umax, so only use it for uadd.
16645 unsigned ExtendOp =
16646 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16647 SDValue Extended = DAG.getNode(
16648 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16649 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16650 // in that case we bitcast the sign extended values from v2i64 to v4i32
16651 // before reduction for optimal code generation.
16652 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16653 NumElems == 2 && ExtendedWidth == 128) {
16654 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16655 ExtendedVT = MVT::i32;
16656 }
16657 switch (ScalarOpcode) {
16658 case ISD::AND:
16659 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16660 break;
16661 case ISD::OR:
16662 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16663 break;
16664 case ISD::XOR:
16665 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16666 break;
16667 default:
16668 llvm_unreachable("Unexpected Opcode");
16669 }
16670
16671 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16672 } else {
16673 // Iteratively split the vector in half and combine using the bitwise
16674 // operation until it fits in a 64 bit register.
16675 while (VecVT.getSizeInBits() > 64) {
16676 SDValue Lo, Hi;
16677 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16678 VecVT = Lo.getValueType();
16679 NumElems = VecVT.getVectorNumElements();
16680 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16681 }
16682
16683 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16684
16685 // Do the remaining work on a scalar since it allows the code generator to
16686 // combine the shift and bitwise operation into one instruction and since
16687 // integer instructions can have higher throughput than vector instructions.
16688 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16689
16690 // Iteratively combine the lower and upper halves of the scalar using the
16691 // bitwise operation, halving the relevant region of the scalar in each
16692 // iteration, until the relevant region is just one element of the original
16693 // vector.
16694 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16695 SDValue ShiftAmount =
16696 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16697 SDValue Shifted =
16698 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16699 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16700 }
16701
16702 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16703 }
16704
16705 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16706}
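// For illustration, an assumed v8i8 XOR reduction: the vector is bitcast to an
// i64 and folded with three shift-and-xor steps,
//   x ^= x >> 32;  x ^= x >> 16;  x ^= x >> 8;
// leaving the reduced value in the low byte.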
16707
16708SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16709 SelectionDAG &DAG) const {
16710 SDValue Src = Op.getOperand(0);
16711 EVT SrcVT = Src.getValueType();
16712
16713 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16714 // widening by inserting zeroes.
16715 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16716 SrcVT == MVT::v2f16) {
16717 SDLoc DL(Op);
16718 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16719 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16720 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16721 }
16722
16723 // Try to lower fixed length reductions to SVE.
16724 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16725 Op.getOpcode() == ISD::VECREDUCE_AND ||
16726 Op.getOpcode() == ISD::VECREDUCE_OR ||
16727 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16728 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16729 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16730 SrcVT.getVectorElementType() == MVT::i64);
16731 if (SrcVT.isScalableVector() ||
16733 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16734
16735 if (SrcVT.getVectorElementType() == MVT::i1)
16736 return LowerPredReductionToSVE(Op, DAG);
16737
16738 switch (Op.getOpcode()) {
16739 case ISD::VECREDUCE_ADD:
16740 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16741 case ISD::VECREDUCE_AND:
16742 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16743 case ISD::VECREDUCE_OR:
16744 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16745 case ISD::VECREDUCE_SMAX:
16746 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16747 case ISD::VECREDUCE_SMIN:
16748 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16749 case ISD::VECREDUCE_UMAX:
16750 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16751 case ISD::VECREDUCE_UMIN:
16752 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16753 case ISD::VECREDUCE_XOR:
16754 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16755 case ISD::VECREDUCE_FADD:
16756 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16757 case ISD::VECREDUCE_FMAX:
16758 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16759 case ISD::VECREDUCE_FMIN:
16760 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16761 case ISD::VECREDUCE_FMAXIMUM:
16762 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16763 case ISD::VECREDUCE_FMINIMUM:
16764 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16765 default:
16766 llvm_unreachable("Unhandled fixed length reduction");
16767 }
16768 }
16769
16770 // Lower NEON reductions.
16771 SDLoc DL(Op);
16772 switch (Op.getOpcode()) {
16773 case ISD::VECREDUCE_AND:
16774 case ISD::VECREDUCE_OR:
16775 case ISD::VECREDUCE_XOR:
16776 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16777 Op.getValueType(), DL, DAG);
16778 case ISD::VECREDUCE_ADD:
16779 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16780 case ISD::VECREDUCE_SMAX:
16781 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16782 case ISD::VECREDUCE_SMIN:
16783 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16784 case ISD::VECREDUCE_UMAX:
16785 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16786 case ISD::VECREDUCE_UMIN:
16787 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16788 default:
16789 llvm_unreachable("Unhandled reduction");
16790 }
16791}
16792
16793SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16794 SelectionDAG &DAG) const {
16795 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16796 // No point replacing if we don't have the relevant instruction/libcall anyway
16797 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16798 return SDValue();
16799
16800 // LSE has an atomic load-clear instruction, but not a load-and.
16801 SDLoc DL(Op);
16802 MVT VT = Op.getSimpleValueType();
16803 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16804 SDValue RHS = Op.getOperand(2);
16805 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16806 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16807 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16808 Op.getOperand(0), Op.getOperand(1), RHS,
16809 AN->getMemOperand());
16810}
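// For illustration, an assumed case: atomicrmw and of %v is rewritten as an
// ATOMIC_LOAD_CLR of (xor %v, -1), so that LSE's load-and-clear instruction
// (LDCLR) can be selected for it.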
16811
16812SDValue
16813AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16814 SelectionDAG &DAG) const {
16815
16816 SDLoc DL(Op);
16817 // Get the inputs.
16818 SDNode *Node = Op.getNode();
16819 SDValue Chain = Op.getOperand(0);
16820 SDValue Size = Op.getOperand(1);
16821 MaybeAlign Align =
16822 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16823 EVT VT = Node->getValueType(0);
16824
16826 "no-stack-arg-probe")) {
16827 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16828 Chain = SP.getValue(1);
16829 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16830 if (Align)
16831 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16832 DAG.getSignedConstant(-Align->value(), DL, VT));
16833 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16834 SDValue Ops[2] = {SP, Chain};
16835 return DAG.getMergeValues(Ops, DL);
16836 }
16837
16838 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16839
16840 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16841 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16842 PtrVT, 0);
16843
16844 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16845 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16846 if (Subtarget->hasCustomCallingConv())
16847 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16848
16849 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16850 DAG.getConstant(4, DL, MVT::i64));
16851 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16852 Chain =
16853 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16854 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16855 DAG.getRegisterMask(Mask), Chain.getValue(1));
16856 // To match the actual intent better, we should read the output from X15 here
16857 // again (instead of potentially spilling it to the stack), but rereading Size
16858 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16859 // here.
16860
16861 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16862 DAG.getConstant(4, DL, MVT::i64));
16863
16864 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16865 Chain = SP.getValue(1);
16866 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16867 if (Align)
16868 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16869 DAG.getSignedConstant(-Align->value(), DL, VT));
16870 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16871
16872 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16873
16874 SDValue Ops[2] = {SP, Chain};
16875 return DAG.getMergeValues(Ops, DL);
16876}
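// A note on the shifts above (inferred from the surrounding code): the stack
// probe helper expects the allocation size in X15 in units of 16 bytes, which
// is why Size is shifted right by 4 before the call and shifted left by 4
// again before being subtracted from SP.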
16877
16878SDValue
16879AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16880 SelectionDAG &DAG) const {
16881 // Get the inputs.
16882 SDNode *Node = Op.getNode();
16883 SDValue Chain = Op.getOperand(0);
16884 SDValue Size = Op.getOperand(1);
16885
16886 MaybeAlign Align =
16887 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16888 SDLoc DL(Op);
16889 EVT VT = Node->getValueType(0);
16890
16891 // Construct the new SP value in a GPR.
16892 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16893 Chain = SP.getValue(1);
16894 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16895 if (Align)
16896 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16897 DAG.getSignedConstant(-Align->value(), DL, VT));
16898
16899 // Set the real SP to the new value with a probing loop.
16900 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16901 SDValue Ops[2] = {SP, Chain};
16902 return DAG.getMergeValues(Ops, DL);
16903}
16904
16905SDValue
16906AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16907 SelectionDAG &DAG) const {
16908 MachineFunction &MF = DAG.getMachineFunction();
16909
16910 if (Subtarget->isTargetWindows())
16911 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16912 else if (hasInlineStackProbe(MF))
16913 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16914 else
16915 return SDValue();
16916}
16917
16918SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16919 unsigned NewOp) const {
16920 if (Subtarget->hasSVE2())
16921 return LowerToPredicatedOp(Op, DAG, NewOp);
16922
16923 // Default to expand.
16924 return SDValue();
16925}
16926
16927SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16928 SelectionDAG &DAG) const {
16929 EVT VT = Op.getValueType();
16930 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16931
16932 SDLoc DL(Op);
16933 APInt MulImm = Op.getConstantOperandAPInt(0);
16934 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16935 VT);
16936}
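// For illustration, an assumed i32 case: VSCALE with multiplier 4 is rebuilt
// as the (legal) 64-bit vscale node and then truncated back to i32.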
16937
16938/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16939template <unsigned NumVecs>
16940static bool
16944 // Retrieve EC from first vector argument.
16945 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16947#ifndef NDEBUG
16948 // Check the assumption that all input vectors are the same type.
16949 for (unsigned I = 0; I < NumVecs; ++I)
16950 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16951 "Invalid type.");
16952#endif
16953 // memVT is `NumVecs * VT`.
16955 EC * NumVecs);
16956 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16957 Info.offset = 0;
16958 Info.align.reset();
16960 return true;
16961}
16962
16963/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16964/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16965/// specified in the intrinsic calls.
16967 const CallInst &I,
16968 MachineFunction &MF,
16969 unsigned Intrinsic) const {
16970 auto &DL = I.getDataLayout();
16971 switch (Intrinsic) {
16972 case Intrinsic::aarch64_sve_st2:
16973 return setInfoSVEStN<2>(*this, DL, Info, I);
16974 case Intrinsic::aarch64_sve_st3:
16975 return setInfoSVEStN<3>(*this, DL, Info, I);
16976 case Intrinsic::aarch64_sve_st4:
16977 return setInfoSVEStN<4>(*this, DL, Info, I);
16978 case Intrinsic::aarch64_neon_ld2:
16979 case Intrinsic::aarch64_neon_ld3:
16980 case Intrinsic::aarch64_neon_ld4:
16981 case Intrinsic::aarch64_neon_ld1x2:
16982 case Intrinsic::aarch64_neon_ld1x3:
16983 case Intrinsic::aarch64_neon_ld1x4: {
16984 Info.opc = ISD::INTRINSIC_W_CHAIN;
16985 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16986 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16987 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16988 Info.offset = 0;
16989 Info.align.reset();
16990 // volatile loads with NEON intrinsics not supported
16991 Info.flags = MachineMemOperand::MOLoad;
16992 return true;
16993 }
16994 case Intrinsic::aarch64_neon_ld2lane:
16995 case Intrinsic::aarch64_neon_ld3lane:
16996 case Intrinsic::aarch64_neon_ld4lane:
16997 case Intrinsic::aarch64_neon_ld2r:
16998 case Intrinsic::aarch64_neon_ld3r:
16999 case Intrinsic::aarch64_neon_ld4r: {
17000 Info.opc = ISD::INTRINSIC_W_CHAIN;
17001 // The ldN intrinsics return a struct whose members all have the same vector type.
17002 Type *RetTy = I.getType();
17003 auto *StructTy = cast<StructType>(RetTy);
17004 unsigned NumElts = StructTy->getNumElements();
17005 Type *VecTy = StructTy->getElementType(0);
17006 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17007 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17008 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17009 Info.offset = 0;
17010 Info.align.reset();
17011 // volatile loads with NEON intrinsics not supported
17012 Info.flags = MachineMemOperand::MOLoad;
17013 return true;
17014 }
17015 case Intrinsic::aarch64_neon_st2:
17016 case Intrinsic::aarch64_neon_st3:
17017 case Intrinsic::aarch64_neon_st4:
17018 case Intrinsic::aarch64_neon_st1x2:
17019 case Intrinsic::aarch64_neon_st1x3:
17020 case Intrinsic::aarch64_neon_st1x4: {
17021 Info.opc = ISD::INTRINSIC_VOID;
17022 unsigned NumElts = 0;
17023 for (const Value *Arg : I.args()) {
17024 Type *ArgTy = Arg->getType();
17025 if (!ArgTy->isVectorTy())
17026 break;
17027 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17028 }
17029 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17030 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17031 Info.offset = 0;
17032 Info.align.reset();
17033 // volatile stores with NEON intrinsics not supported
17034 Info.flags = MachineMemOperand::MOStore;
17035 return true;
17036 }
17037 case Intrinsic::aarch64_neon_st2lane:
17038 case Intrinsic::aarch64_neon_st3lane:
17039 case Intrinsic::aarch64_neon_st4lane: {
17040 Info.opc = ISD::INTRINSIC_VOID;
17041 unsigned NumElts = 0;
17042 // All the vector arguments have the same type.
17043 Type *VecTy = I.getArgOperand(0)->getType();
17044 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17045
17046 for (const Value *Arg : I.args()) {
17047 Type *ArgTy = Arg->getType();
17048 if (!ArgTy->isVectorTy())
17049 break;
17050 NumElts += 1;
17051 }
17052
17053 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17054 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17055 Info.offset = 0;
17056 Info.align.reset();
17057 // volatile stores with NEON intrinsics not supported
17058 Info.flags = MachineMemOperand::MOStore;
17059 return true;
17060 }
17061 case Intrinsic::aarch64_ldaxr:
17062 case Intrinsic::aarch64_ldxr: {
17063 Type *ValTy = I.getParamElementType(0);
17064 Info.opc = ISD::INTRINSIC_W_CHAIN;
17065 Info.memVT = MVT::getVT(ValTy);
17066 Info.ptrVal = I.getArgOperand(0);
17067 Info.offset = 0;
17068 Info.align = DL.getABITypeAlign(ValTy);
17070 return true;
17071 }
17072 case Intrinsic::aarch64_stlxr:
17073 case Intrinsic::aarch64_stxr: {
17074 Type *ValTy = I.getParamElementType(1);
17075 Info.opc = ISD::INTRINSIC_W_CHAIN;
17076 Info.memVT = MVT::getVT(ValTy);
17077 Info.ptrVal = I.getArgOperand(1);
17078 Info.offset = 0;
17079 Info.align = DL.getABITypeAlign(ValTy);
17081 return true;
17082 }
17083 case Intrinsic::aarch64_ldaxp:
17084 case Intrinsic::aarch64_ldxp:
17085 Info.opc = ISD::INTRINSIC_W_CHAIN;
17086 Info.memVT = MVT::i128;
17087 Info.ptrVal = I.getArgOperand(0);
17088 Info.offset = 0;
17089 Info.align = Align(16);
17091 return true;
17092 case Intrinsic::aarch64_stlxp:
17093 case Intrinsic::aarch64_stxp:
17094 Info.opc = ISD::INTRINSIC_W_CHAIN;
17095 Info.memVT = MVT::i128;
17096 Info.ptrVal = I.getArgOperand(2);
17097 Info.offset = 0;
17098 Info.align = Align(16);
17100 return true;
17101 case Intrinsic::aarch64_sve_ldnt1: {
17102 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17103 Info.opc = ISD::INTRINSIC_W_CHAIN;
17104 Info.memVT = MVT::getVT(I.getType());
17105 Info.ptrVal = I.getArgOperand(1);
17106 Info.offset = 0;
17107 Info.align = DL.getABITypeAlign(ElTy);
17109 return true;
17110 }
17111 case Intrinsic::aarch64_sve_stnt1: {
17112 Type *ElTy =
17113 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17114 Info.opc = ISD::INTRINSIC_W_CHAIN;
17115 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17116 Info.ptrVal = I.getArgOperand(2);
17117 Info.offset = 0;
17118 Info.align = DL.getABITypeAlign(ElTy);
17120 return true;
17121 }
17122 case Intrinsic::aarch64_mops_memset_tag: {
17123 Value *Dst = I.getArgOperand(0);
17124 Value *Val = I.getArgOperand(1);
17125 Info.opc = ISD::INTRINSIC_W_CHAIN;
17126 Info.memVT = MVT::getVT(Val->getType());
17127 Info.ptrVal = Dst;
17128 Info.offset = 0;
17129 Info.align = I.getParamAlign(0).valueOrOne();
17130 Info.flags = MachineMemOperand::MOStore;
17131 // The size of the memory being operated on is unknown at this point
17132 Info.size = MemoryLocation::UnknownSize;
17133 return true;
17134 }
17135 default:
17136 break;
17137 }
17138
17139 return false;
17140}
17141
17143 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17144 std::optional<unsigned> ByteOffset) const {
17145 // TODO: This may be worth removing. Check regression tests for diffs.
17146 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17147 ByteOffset))
17148 return false;
17149
17150 // If we're reducing the load width in order to avoid having to use an extra
17151 // instruction to do extension, then it's probably a good idea.
17152 if (ExtTy != ISD::NON_EXTLOAD)
17153 return true;
17154 // Don't reduce load width if it would prevent us from combining a shift into
17155 // the offset.
17156 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17157 assert(Mem);
17158 const SDValue &Base = Mem->getBasePtr();
17159 if (Base.getOpcode() == ISD::ADD &&
17160 Base.getOperand(1).getOpcode() == ISD::SHL &&
17161 Base.getOperand(1).hasOneUse() &&
17162 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17163 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17164 if (Mem->getMemoryVT().isScalableVector())
17165 return false;
17166 // The shift can be combined if it matches the size of the value being
17167 // loaded (and so reducing the width would make it not match).
17168 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17169 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17170 if (ShiftAmount == Log2_32(LoadBytes))
17171 return false;
17172 }
17173 // We have no reason to disallow reducing the load width, so allow it.
17174 return true;
17175}
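// Illustrative sketch (editorial, not from the LLVM sources): the guard above
// keeps the register-offset addressing form "ldr x0, [x1, x2, lsl #3]" intact.
// At the source level the protected pattern looks like the function below: the
// index scaling matches the load width, so the shift can fold into the load,
// and narrowing the load would break that match.
#include <cstdint>
inline uint64_t loadWithScaledIndex(const uint64_t *Base, uint64_t Index) {
  return Base[Index]; // address = Base + (Index << 3), 3 == Log2(sizeof(uint64_t))
}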
17176
17177// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17179 EVT VT = Extend.getValueType();
17180 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17181 SDValue Extract = Extend.getOperand(0);
17182 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17183 Extract = Extract.getOperand(0);
17184 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17185 EVT VecVT = Extract.getOperand(0).getValueType();
17186 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17187 return false;
17188 }
17189 }
17190 return true;
17191}
17192
17193// Truncations from 64-bit GPR to 32-bit GPR are free.
17195 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17196 return false;
17197 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17198 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17199 return NumBits1 > NumBits2;
17200}
17202 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17203 return false;
17204 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17205 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17206 return NumBits1 > NumBits2;
17207}
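// Illustrative sketch (editorial, not from the LLVM sources): a 64-bit to
// 32-bit integer truncation needs no instruction, because reading the W view
// of an X register already is the truncation.
#include <cstdint>
inline uint32_t truncateIsFree(uint64_t X) {
  return static_cast<uint32_t>(X); // typically folds away entirely
}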
17208
17209/// Check if it is profitable to hoist instruction in then/else to if.
 17210/// Not profitable if I and its user can form an FMA instruction
17211/// because we prefer FMSUB/FMADD.
17213 if (I->getOpcode() != Instruction::FMul)
17214 return true;
17215
17216 if (!I->hasOneUse())
17217 return true;
17218
17219 Instruction *User = I->user_back();
17220
17221 if (!(User->getOpcode() == Instruction::FSub ||
17222 User->getOpcode() == Instruction::FAdd))
17223 return true;
17224
17226 const Function *F = I->getFunction();
17227 const DataLayout &DL = F->getDataLayout();
17228 Type *Ty = User->getOperand(0)->getType();
17229
17230 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17232 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17233 I->getFastMathFlags().allowContract()));
17234}
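// Illustrative sketch (editorial, not from the LLVM sources): the heuristic
// above keeps an fmul next to its fadd/fsub user so the pair can still
// contract into FMADD/FMSUB. With FP contraction allowed, the pattern below
// typically becomes a single fmadd; hoisting only the multiply out of a branch
// would lose that fusion.
inline float fusedMultiplyAddCandidate(float A, float B, float C) {
  return A * B + C; // may contract to "fmadd s0, s0, s1, s2" when fusion is allowed
}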
17235
17236// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17237// 64-bit GPR.
17239 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17240 return false;
17241 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17242 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17243 return NumBits1 == 32 && NumBits2 == 64;
17244}
17246 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17247 return false;
17248 unsigned NumBits1 = VT1.getSizeInBits();
17249 unsigned NumBits2 = VT2.getSizeInBits();
17250 return NumBits1 == 32 && NumBits2 == 64;
17251}
17252
17254 EVT VT1 = Val.getValueType();
17255 if (isZExtFree(VT1, VT2)) {
17256 return true;
17257 }
17258
17259 if (Val.getOpcode() != ISD::LOAD)
17260 return false;
17261
17262 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17263 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17264 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17265 VT1.getSizeInBits() <= 32);
17266}
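// Illustrative sketch (editorial, not from the LLVM sources): because 32-bit
// operations clear bits [63:32] of the destination X register, and because
// ldrb/ldrh/32-bit ldr zero-extend implicitly, the widening below is free.
#include <cstdint>
inline uint64_t zeroExtendIsFree(uint32_t W) {
  return W; // the producing 32-bit op already zeroed the high half
}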
17267
17268bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17269 if (isa<FPExtInst>(Ext))
17270 return false;
17271
17272 // Vector types are not free.
17273 if (Ext->getType()->isVectorTy())
17274 return false;
17275
17276 for (const Use &U : Ext->uses()) {
17277 // The extension is free if we can fold it with a left shift in an
17278 // addressing mode or an arithmetic operation: add, sub, and cmp.
17279
17280 // Is there a shift?
17281 const Instruction *Instr = cast<Instruction>(U.getUser());
17282
17283 // Is this a constant shift?
17284 switch (Instr->getOpcode()) {
17285 case Instruction::Shl:
17286 if (!isa<ConstantInt>(Instr->getOperand(1)))
17287 return false;
17288 break;
17289 case Instruction::GetElementPtr: {
17290 gep_type_iterator GTI = gep_type_begin(Instr);
17291 auto &DL = Ext->getDataLayout();
17292 std::advance(GTI, U.getOperandNo()-1);
17293 Type *IdxTy = GTI.getIndexedType();
17294 // This extension will end up with a shift because of the scaling factor.
17295 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17296 // Get the shift amount based on the scaling factor:
17297 // log2(sizeof(IdxTy)) - log2(8).
17298 if (IdxTy->isScalableTy())
17299 return false;
17300 uint64_t ShiftAmt =
17301 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17302 3;
17303 // Is the constant foldable in the shift of the addressing mode?
17304 // I.e., shift amount is between 1 and 4 inclusive.
17305 if (ShiftAmt == 0 || ShiftAmt > 4)
17306 return false;
17307 break;
17308 }
17309 case Instruction::Trunc:
17310 // Check if this is a noop.
17311 // trunc(sext ty1 to ty2) to ty1.
17312 if (Instr->getType() == Ext->getOperand(0)->getType())
17313 continue;
17314 [[fallthrough]];
17315 default:
17316 return false;
17317 }
17318
17319 // At this point we can use the bfm family, so this extension is free
17320 // for that use.
17321 }
17322 return true;
17323}
17324
17325static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17326 unsigned NumElts, bool IsLittleEndian,
17327 SmallVectorImpl<int> &Mask) {
17328 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17329 return false;
17330
17331 assert(DstWidth % SrcWidth == 0 &&
17332 "TBL lowering is not supported for a conversion instruction with this "
17333 "source and destination element type.");
17334
17335 unsigned Factor = DstWidth / SrcWidth;
17336 unsigned MaskLen = NumElts * Factor;
17337
17338 Mask.clear();
17339 Mask.resize(MaskLen, NumElts);
17340
17341 unsigned SrcIndex = 0;
17342 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17343 Mask[I] = SrcIndex++;
17344
17345 return true;
17346}
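// Illustrative sketch (editorial, not from the LLVM sources): a standalone
// mirror of the mask construction above (legality guards omitted), to show the
// layout it produces. For SrcWidth=8, DstWidth=32, NumElts=4 on little-endian
// the result is {0,4,4,4, 1,4,4,4, 2,4,4,4, 3,4,4,4}: each source byte lands
// in the low byte of its destination lane, and index NumElts selects the zero
// element that the caller appends as the second shuffle operand.
#include <vector>
inline std::vector<int> tblZExtMaskSketch(unsigned SrcWidth, unsigned DstWidth,
                                          unsigned NumElts, bool IsLittleEndian) {
  unsigned Factor = DstWidth / SrcWidth;
  std::vector<int> Mask(NumElts * Factor, int(NumElts)); // NumElts == "pick zero"
  unsigned SrcIndex = 0;
  for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < Mask.size(); I += Factor)
    Mask[I] = int(SrcIndex++);
  return Mask;
}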
17347
17349 FixedVectorType *ZExtTy,
17350 FixedVectorType *DstTy,
17351 bool IsLittleEndian) {
17352 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17353 unsigned NumElts = SrcTy->getNumElements();
17354 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17355 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17356
17357 SmallVector<int> Mask;
17358 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17359 return nullptr;
17360
17361 auto *FirstEltZero = Builder.CreateInsertElement(
17362 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17363 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17364 Result = Builder.CreateBitCast(Result, DstTy);
17365 if (DstTy != ZExtTy)
17366 Result = Builder.CreateZExt(Result, ZExtTy);
17367 return Result;
17368}
17369
17371 FixedVectorType *DstTy,
17372 bool IsLittleEndian) {
17373 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17374 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17375 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17376
17377 SmallVector<int> Mask;
17378 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17379 !IsLittleEndian, Mask))
17380 return nullptr;
17381
17382 auto *FirstEltZero = Builder.CreateInsertElement(
17383 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17384
17385 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17386}
17387
17388static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17389 IRBuilder<> Builder(TI);
17391 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17392 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17393 auto *DstTy = cast<FixedVectorType>(TI->getType());
17394 assert(SrcTy->getElementType()->isIntegerTy() &&
17395 "Non-integer type source vector element is not supported");
17396 assert(DstTy->getElementType()->isIntegerTy(8) &&
17397 "Unsupported destination vector element type");
17398 unsigned SrcElemTySz =
17399 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17400 unsigned DstElemTySz =
17401 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17402 assert((SrcElemTySz % DstElemTySz == 0) &&
17403 "Cannot lower truncate to tbl instructions for a source element size "
17404 "that is not divisible by the destination element size");
17405 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17406 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17407 "Unsupported source vector element type size");
17408 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17409
17410 // Create a mask to choose every nth byte from the source vector table of
17411 // bytes to create the truncated destination vector, where 'n' is the truncate
17412 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17413 // 0,8,16,..Y*8th bytes for the little-endian format
17415 for (int Itr = 0; Itr < 16; Itr++) {
17416 if (Itr < NumElements)
17417 MaskConst.push_back(Builder.getInt8(
17418 IsLittleEndian ? Itr * TruncFactor
17419 : Itr * TruncFactor + (TruncFactor - 1)));
17420 else
17421 MaskConst.push_back(Builder.getInt8(255));
17422 }
17423
17424 int MaxTblSz = 128 * 4;
17425 int MaxSrcSz = SrcElemTySz * NumElements;
17426 int ElemsPerTbl =
17427 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17428 assert(ElemsPerTbl <= 16 &&
17429 "Maximum elements selected using TBL instruction cannot exceed 16!");
17430
17431 int ShuffleCount = 128 / SrcElemTySz;
17432 SmallVector<int> ShuffleLanes;
17433 for (int i = 0; i < ShuffleCount; ++i)
17434 ShuffleLanes.push_back(i);
17435
17436 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17437 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17438 // call TBL & save the result in a vector of TBL results for combining later.
17440 while (ShuffleLanes.back() < NumElements) {
17441 Parts.push_back(Builder.CreateBitCast(
17442 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17443
17444 if (Parts.size() == 4) {
17445 Parts.push_back(ConstantVector::get(MaskConst));
17446 Results.push_back(
17447 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17448 Parts.clear();
17449 }
17450
17451 for (int i = 0; i < ShuffleCount; ++i)
17452 ShuffleLanes[i] += ShuffleCount;
17453 }
17454
17455 assert((Parts.empty() || Results.empty()) &&
17456 "Lowering trunc for vectors requiring different TBL instructions is "
17457 "not supported!");
17458 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17459 // registers
17460 if (!Parts.empty()) {
17461 Intrinsic::ID TblID;
17462 switch (Parts.size()) {
17463 case 1:
17464 TblID = Intrinsic::aarch64_neon_tbl1;
17465 break;
17466 case 2:
17467 TblID = Intrinsic::aarch64_neon_tbl2;
17468 break;
17469 case 3:
17470 TblID = Intrinsic::aarch64_neon_tbl3;
17471 break;
17472 }
17473
17474 Parts.push_back(ConstantVector::get(MaskConst));
17475 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17476 }
17477
17478 // Extract the destination vector from TBL result(s) after combining them
17479 // where applicable. Currently, at most two TBLs are supported.
17480 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17481 "more than 2 tbl instructions!");
17482 Value *FinalResult = Results[0];
17483 if (Results.size() == 1) {
17484 if (ElemsPerTbl < 16) {
17485 SmallVector<int> FinalMask(ElemsPerTbl);
17486 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17487 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17488 }
17489 } else {
17490 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17491 if (ElemsPerTbl < 16) {
17492 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17493 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17494 } else {
17495 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17496 }
17497 FinalResult =
17498 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17499 }
17500
17501 TI->replaceAllUsesWith(FinalResult);
17502 TI->eraseFromParent();
17503}
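// Illustrative sketch (editorial, not from the LLVM sources): the byte
// selection the trunc-to-TBL lowering above builds for a little-endian
// v8i32 -> v8i8 truncate. Byte 0 of each 32-bit lane is kept (0,4,8,...,28)
// and the remainder of the 16-byte TBL mask is 255, which makes TBL write
// zero into those lanes.
#include <array>
#include <cstdint>
inline std::array<uint8_t, 16> truncTblMaskSketch() {
  constexpr unsigned NumElements = 8, TruncFactor = 4; // v8i32 -> v8i8
  std::array<uint8_t, 16> Mask{};
  for (unsigned I = 0; I < 16; ++I)
    Mask[I] = I < NumElements ? uint8_t(I * TruncFactor) : uint8_t(255);
  return Mask;
}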
17504
17506 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17507 // shuffle_vector instructions are serialized when targeting SVE,
17508 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17509 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17510 return false;
17511
17512 // Try to optimize conversions using tbl. This requires materializing constant
17513 // index vectors, which can increase code size and add loads. Skip the
17514 // transform unless the conversion is in a loop block guaranteed to execute
17515 // and we are not optimizing for size.
17516 Function *F = I->getParent()->getParent();
17517 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17518 return false;
17519
17520 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17521 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17522 if (!SrcTy || !DstTy)
17523 return false;
17524
17525 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17526 // lowered to tbl instructions to insert the original i8 elements
17527 // into i8x lanes. This is enabled for cases where it is beneficial.
17528 auto *ZExt = dyn_cast<ZExtInst>(I);
17529 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17530 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17531 if (DstWidth % 8 != 0)
17532 return false;
17533
17534 auto *TruncDstType =
17536 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17537 // the remaining ZExt folded into the user, don't use tbl lowering.
17538 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17539 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17542 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17543 return false;
17544
17545 DstTy = TruncDstType;
17546 }
17547
17548 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17549 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17550 // most one extra extend step is needed and using tbl is not profitable.
17551 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17552 // udot instruction.
17553 if (SrcWidth * 4 <= DstWidth) {
17554 if (all_of(I->users(), [&](auto *U) {
17555 auto *SingleUser = cast<Instruction>(&*U);
17556 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17557 return true;
17558 if (match(SingleUser,
17559 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17560 m_Value(), m_Specific(I))))
17561 return true;
17562 return false;
17563 }))
17564 return false;
17565 }
17566
17567 if (DstTy->getScalarSizeInBits() >= 64)
17568 return false;
17569
17570 IRBuilder<> Builder(ZExt);
17572 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17573 DstTy, Subtarget->isLittleEndian());
17574 if (!Result)
17575 return false;
17576 ZExt->replaceAllUsesWith(Result);
17577 ZExt->eraseFromParent();
17578 return true;
17579 }
17580
17581 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17582 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17583 DstTy->getElementType()->isFloatTy()) ||
17584 (SrcTy->getElementType()->isIntegerTy(16) &&
17585 DstTy->getElementType()->isDoubleTy()))) {
17586 IRBuilder<> Builder(I);
17588 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17589 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17590 assert(ZExt && "Cannot fail for the i8 to float conversion");
17591 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17592 I->replaceAllUsesWith(UI);
17593 I->eraseFromParent();
17594 return true;
17595 }
17596
17597 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17598 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17599 DstTy->getElementType()->isFloatTy()) {
17600 IRBuilder<> Builder(I);
17601 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17603 Subtarget->isLittleEndian());
17604 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17605 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17606 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17607 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17608 I->replaceAllUsesWith(SI);
17609 I->eraseFromParent();
17610 return true;
17611 }
17612
17613 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17614 // followed by a truncate lowered to using tbl.4.
17615 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17616 if (FPToUI &&
17617 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17618 SrcTy->getElementType()->isFloatTy() &&
17619 DstTy->getElementType()->isIntegerTy(8)) {
17620 IRBuilder<> Builder(I);
17621 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17622 VectorType::getInteger(SrcTy));
17623 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17624 I->replaceAllUsesWith(TruncI);
17625 I->eraseFromParent();
17626 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17627 return true;
17628 }
17629
17630 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17631 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17632 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17633 // registers
17634 auto *TI = dyn_cast<TruncInst>(I);
17635 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17636 ((SrcTy->getElementType()->isIntegerTy(32) ||
17637 SrcTy->getElementType()->isIntegerTy(64)) &&
17638 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17639 createTblForTrunc(TI, Subtarget->isLittleEndian());
17640 return true;
17641 }
17642
17643 return false;
17644}
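// Illustrative sketch (editorial, not from the LLVM sources): the kind of
// source loop whose vectorized zext this hook may rewrite into TBL shuffles.
// Widening i8 data inside a hot loop, and not optimizing for size, makes the
// one-off materialization of the constant index vectors worthwhile.
#include <cstdint>
inline void widenBytes(const uint8_t *Src, uint32_t *Dst, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[I]; // vectorizes to a "zext <N x i8> to <N x i32>"
}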
17645
17647 Align &RequiredAlignment) const {
17648 if (!LoadedType.isSimple() ||
17649 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17650 return false;
17651 // Cyclone supports unaligned accesses.
17652 RequiredAlignment = Align(1);
17653 unsigned NumBits = LoadedType.getSizeInBits();
17654 return NumBits == 32 || NumBits == 64;
17655}
17656
17657/// A helper function for determining the number of interleaved accesses we
17658/// will generate when lowering accesses of the given type.
17660 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17661 unsigned VecSize = 128;
17662 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17663 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17664 if (UseScalable && isa<FixedVectorType>(VecTy))
17665 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17666 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17667}
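// Worked example (editorial, not from the LLVM sources) for the formula above
// with 128-bit vectors: a <16 x i32> group spans 512 bits and is split into
// four interleaved accesses, while an <8 x i16> group fits in one.
static_assert((16u * 32u + 127u) / 128u == 4u, "v16i32 -> 4 accesses");
static_assert((8u * 16u + 127u) / 128u == 1u, "v8i16 -> 1 access");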
17668
17671 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17672 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17673 return MOStridedAccess;
17675}
17676
17678 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17679 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17680 auto EC = VecTy->getElementCount();
17681 unsigned MinElts = EC.getKnownMinValue();
17682
17683 UseScalable = false;
17684
17685 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17686 (!Subtarget->useSVEForFixedLengthVectors() ||
17688 return false;
17689
17690 if (isa<ScalableVectorType>(VecTy) &&
17691 !Subtarget->isSVEorStreamingSVEAvailable())
17692 return false;
17693
17694 // Ensure the number of vector elements is greater than 1.
17695 if (MinElts < 2)
17696 return false;
17697
17698 // Ensure the element type is legal.
17699 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17700 return false;
17701
17702 if (EC.isScalable()) {
17703 UseScalable = true;
17704 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17705 }
17706
17707 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17708 if (Subtarget->useSVEForFixedLengthVectors()) {
17709 unsigned MinSVEVectorSize =
17710 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17711 if (VecSize % MinSVEVectorSize == 0 ||
17712 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17713 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17714 UseScalable = true;
17715 return true;
17716 }
17717 }
17718
17719 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17720 // 128 will be split into multiple interleaved accesses.
17721 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17722}
17723
17725 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17726 return ScalableVectorType::get(VTy->getElementType(), 2);
17727
17728 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17729 return ScalableVectorType::get(VTy->getElementType(), 4);
17730
17731 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17732 return ScalableVectorType::get(VTy->getElementType(), 8);
17733
17734 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17735 return ScalableVectorType::get(VTy->getElementType(), 8);
17736
17737 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17738 return ScalableVectorType::get(VTy->getElementType(), 2);
17739
17740 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17741 return ScalableVectorType::get(VTy->getElementType(), 4);
17742
17743 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17744 return ScalableVectorType::get(VTy->getElementType(), 8);
17745
17746 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17747 return ScalableVectorType::get(VTy->getElementType(), 16);
17748
17749 llvm_unreachable("Cannot handle input vector type");
17750}
17751
17752static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17753 bool Scalable, Type *LDVTy,
17754 Type *PtrTy) {
17755 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17756 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17757 Intrinsic::aarch64_sve_ld3_sret,
17758 Intrinsic::aarch64_sve_ld4_sret};
17759 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17760 Intrinsic::aarch64_neon_ld3,
17761 Intrinsic::aarch64_neon_ld4};
17762 if (Scalable)
17763 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17764
17765 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17766 {LDVTy, PtrTy});
17767}
17768
17769static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17770 bool Scalable, Type *STVTy,
17771 Type *PtrTy) {
17772 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17773 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17774 Intrinsic::aarch64_sve_st3,
17775 Intrinsic::aarch64_sve_st4};
17776 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17777 Intrinsic::aarch64_neon_st3,
17778 Intrinsic::aarch64_neon_st4};
17779 if (Scalable)
17780 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17781
17782 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17783 {STVTy, PtrTy});
17784}
17785
17786/// Lower an interleaved load into a ldN intrinsic.
17787///
17788/// E.g. Lower an interleaved load (Factor = 2):
17789/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17790/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17791/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17792///
17793/// Into:
17794/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17795/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17796/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17798 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17799 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17800 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17801 "Invalid interleave factor");
17802 assert(!Shuffles.empty() && "Empty shufflevector input");
17803 assert(Shuffles.size() == Indices.size() &&
17804 "Unmatched number of shufflevectors and indices");
17805
17806 auto *LI = dyn_cast<LoadInst>(Load);
17807 if (!LI)
17808 return false;
17809 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17810
17811 const DataLayout &DL = LI->getDataLayout();
17812
17813 VectorType *VTy = Shuffles[0]->getType();
17814
17815 // Skip if we do not have NEON and skip illegal vector types. We can
17816 // "legalize" wide vector types into multiple interleaved accesses as long as
17817 // the vector types are divisible by 128.
17818 bool UseScalable;
17819 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17820 return false;
17821
17822 // Check if the interleave is a zext(shuffle), that can be better optimized
17823 // into shift / and masks. For the moment we do this just for uitofp (not
17824 // zext) to avoid issues with widening instructions.
17825 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17826 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17827 SI->getType()->getScalarSizeInBits() * 4 ==
17828 SI->user_back()->getType()->getScalarSizeInBits();
17829 }))
17830 return false;
17831
17832 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17833
17834 auto *FVTy = cast<FixedVectorType>(VTy);
17835
 17836 // A pointer vector cannot be the return type of the ldN intrinsics, so we
 17837 // need to load integer vectors first and then convert to pointer vectors.
17838 Type *EltTy = FVTy->getElementType();
17839 if (EltTy->isPointerTy())
17840 FVTy =
17841 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17842
17843 // If we're going to generate more than one load, reset the sub-vector type
17844 // to something legal.
17845 FVTy = FixedVectorType::get(FVTy->getElementType(),
17846 FVTy->getNumElements() / NumLoads);
17847
17848 auto *LDVTy =
17849 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17850
17851 IRBuilder<> Builder(LI);
17852
17853 // The base address of the load.
17854 Value *BaseAddr = LI->getPointerOperand();
17855
17856 Type *PtrTy = LI->getPointerOperandType();
17857 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17858 LDVTy->getElementCount());
17859
17860 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17861 UseScalable, LDVTy, PtrTy);
17862
17863 // Holds sub-vectors extracted from the load intrinsic return values. The
17864 // sub-vectors are associated with the shufflevector instructions they will
17865 // replace.
17867
17868 Value *PTrue = nullptr;
17869 if (UseScalable) {
17870 std::optional<unsigned> PgPattern =
17871 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17872 if (Subtarget->getMinSVEVectorSizeInBits() ==
17873 Subtarget->getMaxSVEVectorSizeInBits() &&
17874 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17875 PgPattern = AArch64SVEPredPattern::all;
17876
17877 auto *PTruePat =
17878 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17879 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17880 {PTruePat});
17881 }
17882
17883 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17884
17885 // If we're generating more than one load, compute the base address of
17886 // subsequent loads as an offset from the previous.
17887 if (LoadCount > 0)
17888 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17889 FVTy->getNumElements() * Factor);
17890
17891 CallInst *LdN;
17892 if (UseScalable)
17893 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17894 else
17895 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17896
17897 // Extract and store the sub-vectors returned by the load intrinsic.
17898 for (unsigned i = 0; i < Shuffles.size(); i++) {
17899 ShuffleVectorInst *SVI = Shuffles[i];
17900 unsigned Index = Indices[i];
17901
17902 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17903
17904 if (UseScalable)
17905 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17906
17907 // Convert the integer vector to pointer vector if the element is pointer.
17908 if (EltTy->isPointerTy())
17909 SubVec = Builder.CreateIntToPtr(
17911 FVTy->getNumElements()));
17912
17913 SubVecs[SVI].push_back(SubVec);
17914 }
17915 }
17916
17917 // Replace uses of the shufflevector instructions with the sub-vectors
17918 // returned by the load intrinsic. If a shufflevector instruction is
17919 // associated with more than one sub-vector, those sub-vectors will be
17920 // concatenated into a single wide vector.
17921 for (ShuffleVectorInst *SVI : Shuffles) {
17922 auto &SubVec = SubVecs[SVI];
17923 auto *WideVec =
17924 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17925 SVI->replaceAllUsesWith(WideVec);
17926 }
17927
17928 return true;
17929}
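// Illustrative sketch (editorial, not from the LLVM sources): the source-level
// shape behind the Factor = 2 example in the comment above. The vectorizer
// emits one wide load plus even/odd shuffles, which this hook turns into a
// single ld2.
inline void deinterleavePairs(const int *In, int *Even, int *Odd, int N) {
  for (int I = 0; I < N; ++I) {
    Even[I] = In[2 * I];    // lanes 0, 2, 4, ...
    Odd[I] = In[2 * I + 1]; // lanes 1, 3, 5, ...
  }
}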
17930
17931template <typename Iter>
17932bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17933 int MaxLookupDist = 20;
17934 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17935 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17936 const Value *PtrA1 =
17937 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17938
17939 while (++It != End) {
17940 if (It->isDebugOrPseudoInst())
17941 continue;
17942 if (MaxLookupDist-- == 0)
17943 break;
17944 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17945 const Value *PtrB1 =
17946 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17947 DL, OffsetB);
17948 if (PtrA1 == PtrB1 &&
17949 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17950 .abs() == 16)
17951 return true;
17952 }
17953 }
17954
17955 return false;
17956}
17957
17958/// Lower an interleaved store into a stN intrinsic.
17959///
17960/// E.g. Lower an interleaved store (Factor = 3):
17961/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17962/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17963/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17964///
17965/// Into:
17966/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17967/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17968/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17969/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17970///
17971/// Note that the new shufflevectors will be removed and we'll only generate one
17972/// st3 instruction in CodeGen.
17973///
17974/// Example for a more general valid mask (Factor 3). Lower:
17975/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17976/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17977/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17978///
17979/// Into:
17980/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17981/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17982/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17983/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17985 Value *LaneMask,
17986 ShuffleVectorInst *SVI,
17987 unsigned Factor,
17988 const APInt &GapMask) const {
17989
17990 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17991 "Invalid interleave factor");
17992 auto *SI = dyn_cast<StoreInst>(Store);
17993 if (!SI)
17994 return false;
17995 assert(!LaneMask && GapMask.popcount() == Factor &&
17996 "Unexpected mask on store");
17997
17998 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17999 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18000
18001 unsigned LaneLen = VecTy->getNumElements() / Factor;
18002 Type *EltTy = VecTy->getElementType();
18003 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18004
18005 const DataLayout &DL = SI->getDataLayout();
18006 bool UseScalable;
18007
18008 // Skip if we do not have NEON and skip illegal vector types. We can
18009 // "legalize" wide vector types into multiple interleaved accesses as long as
18010 // the vector types are divisible by 128.
18011 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18012 return false;
18013
18014 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18015
18016 Value *Op0 = SVI->getOperand(0);
18017 Value *Op1 = SVI->getOperand(1);
18018 IRBuilder<> Builder(SI);
18019
18020 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18021 // vectors to integer vectors.
18022 if (EltTy->isPointerTy()) {
18023 Type *IntTy = DL.getIntPtrType(EltTy);
18024 unsigned NumOpElts =
18025 cast<FixedVectorType>(Op0->getType())->getNumElements();
18026
18027 // Convert to the corresponding integer vector.
18028 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18029 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18030 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18031
18032 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18033 }
18034
18035 // If we're going to generate more than one store, reset the lane length
18036 // and sub-vector type to something legal.
18037 LaneLen /= NumStores;
18038 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18039
18040 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18041 : SubVecTy;
18042
18043 // The base address of the store.
18044 Value *BaseAddr = SI->getPointerOperand();
18045
18046 auto Mask = SVI->getShuffleMask();
18047
18048 // Sanity check if all the indices are NOT in range.
18049 // If mask is `poison`, `Mask` may be a vector of -1s.
18050 // If all of them are `poison`, OOB read will happen later.
18051 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
18052 return false;
18053 }
 18054 // A 64-bit st2 which does not start at element 0 will involve adding extra
18055 // ext elements making the st2 unprofitable, and if there is a nearby store
18056 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18057 // zip;ldp pair which has higher throughput.
18058 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18059 (Mask[0] != 0 ||
18060 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18061 DL) ||
18062 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18063 BaseAddr, DL)))
18064 return false;
18065
18066 Type *PtrTy = SI->getPointerOperandType();
18067 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18068 STVTy->getElementCount());
18069
18070 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18071 UseScalable, STVTy, PtrTy);
18072
18073 Value *PTrue = nullptr;
18074 if (UseScalable) {
18075 std::optional<unsigned> PgPattern =
18076 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18077 if (Subtarget->getMinSVEVectorSizeInBits() ==
18078 Subtarget->getMaxSVEVectorSizeInBits() &&
18079 Subtarget->getMinSVEVectorSizeInBits() ==
18080 DL.getTypeSizeInBits(SubVecTy))
18081 PgPattern = AArch64SVEPredPattern::all;
18082
18083 auto *PTruePat =
18084 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18085 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18086 {PTruePat});
18087 }
18088
18089 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18090
18092
18093 // Split the shufflevector operands into sub vectors for the new stN call.
18094 for (unsigned i = 0; i < Factor; i++) {
18095 Value *Shuffle;
18096 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18097 if (Mask[IdxI] >= 0) {
18098 Shuffle = Builder.CreateShuffleVector(
18099 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18100 } else {
18101 unsigned StartMask = 0;
18102 for (unsigned j = 1; j < LaneLen; j++) {
18103 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18104 if (Mask[IdxJ] >= 0) {
18105 StartMask = Mask[IdxJ] - j;
18106 break;
18107 }
18108 }
18109 // Note: Filling undef gaps with random elements is ok, since
18110 // those elements were being written anyway (with undefs).
 18111 // In the case of all undefs we default to using elements from 0.
18112 // Note: StartMask cannot be negative, it's checked in
18113 // isReInterleaveMask
18114 Shuffle = Builder.CreateShuffleVector(
18115 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18116 }
18117
18118 if (UseScalable)
18119 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18120 Shuffle, uint64_t(0));
18121
18122 Ops.push_back(Shuffle);
18123 }
18124
18125 if (UseScalable)
18126 Ops.push_back(PTrue);
18127
 18128 // If we're generating more than one store, we compute the base address of
18129 // subsequent stores as an offset from the previous.
18130 if (StoreCount > 0)
18131 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18132 BaseAddr, LaneLen * Factor);
18133
18134 Ops.push_back(BaseAddr);
18135 Builder.CreateCall(StNFunc, Ops);
18136 }
18137 return true;
18138}
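// Illustrative sketch (editorial, not from the LLVM sources): the source-level
// shape behind the Factor = 2 store case. The vectorizer expresses it as an
// interleaving shufflevector followed by one wide store, which this hook
// lowers to st2.
inline void interleavePairs(const int *A, const int *B, int *Out, int N) {
  for (int I = 0; I < N; ++I) {
    Out[2 * I] = A[I];
    Out[2 * I + 1] = B[I];
  }
}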
18139
18141 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18142 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18143 if (Factor != 2 && Factor != 4) {
18144 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
18145 return false;
18146 }
18147 auto *LI = dyn_cast<LoadInst>(Load);
18148 if (!LI)
18149 return false;
18150 assert(!Mask && "Unexpected mask on a load\n");
18151
18153
18154 const DataLayout &DL = LI->getModule()->getDataLayout();
18155 bool UseScalable;
18156 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18157 return false;
18158
18159 // TODO: Add support for using SVE instructions with fixed types later, using
18160 // the code from lowerInterleavedLoad to obtain the correct container type.
18161 if (UseScalable && !VTy->isScalableTy())
18162 return false;
18163
18164 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18165 VectorType *LdTy =
18167 VTy->getElementCount().divideCoefficientBy(NumLoads));
18168
18169 Type *PtrTy = LI->getPointerOperandType();
18170 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18171 UseScalable, LdTy, PtrTy);
18172
18173 IRBuilder<> Builder(LI);
18174 Value *Pred = nullptr;
18175 if (UseScalable)
18176 Pred =
18177 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18178
18179 Value *BaseAddr = LI->getPointerOperand();
18180 Value *Result = nullptr;
18181 if (NumLoads > 1) {
18182 // Create multiple legal small ldN.
18183 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18184 for (unsigned I = 0; I < NumLoads; ++I) {
18185 Value *Offset = Builder.getInt64(I * Factor);
18186
18187 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18188 Value *LdN = nullptr;
18189 if (UseScalable)
18190 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18191 else
18192 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18193 Value *Idx =
18194 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18195 for (unsigned J = 0; J < Factor; ++J) {
18196 ExtractedLdValues[J] = Builder.CreateInsertVector(
18197 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18198 }
18199 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18200 }
18201
18202 // Merge the values from different factors.
18203 Result = PoisonValue::get(DI->getType());
18204 for (unsigned J = 0; J < Factor; ++J)
18205 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18206 } else {
18207 if (UseScalable)
18208 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18209 else
18210 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18211 }
18212
18213 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18214 DI->replaceAllUsesWith(Result);
18215 return true;
18216}
18217
18219 Instruction *Store, Value *Mask,
18220 ArrayRef<Value *> InterleavedValues) const {
18221 unsigned Factor = InterleavedValues.size();
18222 if (Factor != 2 && Factor != 4) {
18223 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
18224 return false;
18225 }
18227 if (!SI)
18228 return false;
18229 assert(!Mask && "Unexpected mask on plain store");
18230
18231 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18232 const DataLayout &DL = SI->getModule()->getDataLayout();
18233
18234 bool UseScalable;
18235 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18236 return false;
18237
18238 // TODO: Add support for using SVE instructions with fixed types later, using
18239 // the code from lowerInterleavedStore to obtain the correct container type.
18240 if (UseScalable && !VTy->isScalableTy())
18241 return false;
18242
18243 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18244
18245 VectorType *StTy =
18247 VTy->getElementCount().divideCoefficientBy(NumStores));
18248
18249 Type *PtrTy = SI->getPointerOperandType();
18250 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18251 UseScalable, StTy, PtrTy);
18252
18253 IRBuilder<> Builder(SI);
18254
18255 Value *BaseAddr = SI->getPointerOperand();
18256 Value *Pred = nullptr;
18257
18258 if (UseScalable)
18259 Pred =
18260 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18261
18262 auto ExtractedValues = InterleavedValues;
18263 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18264 if (UseScalable)
18265 StoreOperands.push_back(Pred);
18266 StoreOperands.push_back(BaseAddr);
18267 for (unsigned I = 0; I < NumStores; ++I) {
18268 Value *Address = BaseAddr;
18269 if (NumStores > 1) {
18270 Value *Offset = Builder.getInt64(I * Factor);
18271 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18272 Value *Idx =
18273 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18274 for (unsigned J = 0; J < Factor; J++) {
18275 StoreOperands[J] =
18276 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18277 }
18278 // update the address
18279 StoreOperands[StoreOperands.size() - 1] = Address;
18280 }
18281 Builder.CreateCall(StNFunc, StoreOperands);
18282 }
18283 return true;
18284}
18285
18287 LLVMContext &Context, const MemOp &Op,
18288 const AttributeList &FuncAttributes) const {
18289 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18290 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18291 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 18292 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
 18293 // it would take one instruction to materialize the v2i64 zero plus one store
 18294 // with a restrictive addressing mode, so just do i64 stores.
18295 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18296 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18297 if (Op.isAligned(AlignCheck))
18298 return true;
18299 unsigned Fast;
18300 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18302 Fast;
18303 };
18304
18305 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18306 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18307 return MVT::v16i8;
18308 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18309 return MVT::f128;
18310 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18311 return MVT::i64;
18312 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18313 return MVT::i32;
18314 return MVT::Other;
18315}
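// Illustrative sketch (editorial, not from the LLVM sources): how the type
// choice above plays out for memset. The 64-byte clear is at or above the
// 32-byte AdvSIMD threshold and is eligible for v16i8 (q-register) stores;
// the 16-byte clear stays on plain i64 stores.
#include <cstring>
struct alignas(16) Block64 { char Bytes[64]; };
struct alignas(16) Block16 { char Bytes[16]; };
inline void clearLarge(Block64 &B) { std::memset(B.Bytes, 0, sizeof(B.Bytes)); }
inline void clearSmall(Block16 &B) { std::memset(B.Bytes, 0, sizeof(B.Bytes)); }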
18316
18318 const MemOp &Op, const AttributeList &FuncAttributes) const {
18319 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18320 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18321 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 18322 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
 18323 // it would take one instruction to materialize the v2i64 zero plus one store
 18324 // with a restrictive addressing mode, so just do i64 stores.
18325 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18326 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18327 if (Op.isAligned(AlignCheck))
18328 return true;
18329 unsigned Fast;
18330 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18332 Fast;
18333 };
18334
18335 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18336 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18337 return LLT::fixed_vector(2, 64);
18338 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18339 return LLT::scalar(128);
18340 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18341 return LLT::scalar(64);
18342 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18343 return LLT::scalar(32);
18344 return LLT();
18345}
18346
18347// 12-bit optionally shifted immediates are legal for adds.
18349 if (Immed == std::numeric_limits<int64_t>::min()) {
18350 return false;
18351 }
18352 // Same encoding for add/sub, just flip the sign.
18353 return isLegalArithImmed((uint64_t)std::abs(Immed));
18354}
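// Illustrative sketch (editorial, not from the LLVM sources): the legal
// unsigned magnitudes are {0..0xFFF} and {0x1000..0xFFF000 in 0x1000 steps};
// negative values are handled above by taking the absolute value and flipping
// ADD to SUB. A sketch of the same condition:
#include <cstdint>
inline bool isAdd12BitImmSketch(uint64_t Imm) {
  return (Imm & ~0xFFFull) == 0 ||    // add Rd, Rn, #imm12
         (Imm & ~0xFFF000ull) == 0;   // add Rd, Rn, #imm12, lsl #12
}
// e.g. 4095 and 0x123000 are legal; 0x1001 needs an extra instruction.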
18355
18357 // We will only emit addvl/inc* instructions for SVE2
18358 if (!Subtarget->hasSVE2())
18359 return false;
18360
18361 // addvl's immediates are in terms of the number of bytes in a register.
18362 // Since there are 16 in the base supported size (128bits), we need to
18363 // divide the immediate by that much to give us a useful immediate to
18364 // multiply by vscale. We can't have a remainder as a result of this.
18365 if (Imm % 16 == 0)
18366 return isInt<6>(Imm / 16);
18367
18368 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18369 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18370 // of addvl as a result, so only take h|w|d into account.
18371 // Dec[h|w|d] will cover subtractions.
18372 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18373 // FIXME: Can we make use of other patterns to cover other immediates?
18374
18375 // inch|dech
18376 if (Imm % 8 == 0)
18377 return std::abs(Imm / 8) <= 16;
18378 // incw|decw
18379 if (Imm % 4 == 0)
18380 return std::abs(Imm / 4) <= 16;
18381 // incd|decd
18382 if (Imm % 2 == 0)
18383 return std::abs(Imm / 2) <= 16;
18384
18385 return false;
18386}
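// Worked examples (editorial, not from the LLVM sources) for the rules above,
// where the requested offset is Imm * vscale bytes:
//   Imm = 32: 32 % 16 == 0 and 32/16 == 2 fits a signed 6-bit field, so
//             "addvl x0, x0, #2" covers it.
//   Imm = 8 : not a multiple of 16, but 8/8 == 1 <= 16, so "inch x0" does.
//   Imm = 6 : 6 % 2 == 0 and 6/2 == 3 <= 16, so "incd x0, all, mul #3" does.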
18387
18388// Return false to prevent folding
18389// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18390// if the folding leads to worse code.
18392 SDValue AddNode, SDValue ConstNode) const {
18393 // Let the DAGCombiner decide for vector types and large types.
18394 const EVT VT = AddNode.getValueType();
18395 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18396 return true;
18397
 18398 // It is worse if c1 is a legal add immediate while c1*c2 is not and
 18399 // has to be composed of at least two instructions.
18400 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18401 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18402 const int64_t C1 = C1Node->getSExtValue();
18403 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18405 return true;
18407 // Adapt to the width of a register.
18408 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18409 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18410 if (Insn.size() > 1)
18411 return false;
18412
18413 // Default to true and let the DAGCombiner decide.
18414 return true;
18415}
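// Worked example (editorial, not from the LLVM sources): with c1 = 1 (a legal
// add immediate) and c2 = 0x12345678, the product c1*c2 = 0x12345678 is not a
// legal add immediate and expandMOVImm needs two instructions for it, so the
// fold is rejected (return false). With c2 = 16 the product 16 is itself a
// legal add immediate and the early "return true" lets DAGCombine fold.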
18416
18417// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18418// immediates is the same as for an add or a sub.
18420 return isLegalAddImmediate(Immed);
18421}
18422
18423/// isLegalAddressingMode - Return true if the addressing mode represented
18424/// by AM is legal for this target, for a load/store of the specified type.
18426 const AddrMode &AMode, Type *Ty,
18427 unsigned AS, Instruction *I) const {
18428 // AArch64 has five basic addressing modes:
18429 // reg
18430 // reg + 9-bit signed offset
18431 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18432 // reg1 + reg2
18433 // reg + SIZE_IN_BYTES * reg
18434
18435 // No global is ever allowed as a base.
18436 if (AMode.BaseGV)
18437 return false;
18438
18439 // No reg+reg+imm addressing.
18440 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18441 return false;
18442
18443 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18444 // `2*ScaledReg` into `BaseReg + ScaledReg`
18445 AddrMode AM = AMode;
18446 if (AM.Scale && !AM.HasBaseReg) {
18447 if (AM.Scale == 1) {
18448 AM.HasBaseReg = true;
18449 AM.Scale = 0;
18450 } else if (AM.Scale == 2) {
18451 AM.HasBaseReg = true;
18452 AM.Scale = 1;
18453 } else {
18454 return false;
18455 }
18456 }
18457
18458 // A base register is required in all addressing modes.
18459 if (!AM.HasBaseReg)
18460 return false;
18461
18462 if (Ty->isScalableTy()) {
18463 if (isa<ScalableVectorType>(Ty)) {
18464 // See if we have a foldable vscale-based offset, for vector types which
18465 // are either legal or smaller than the minimum; more work will be
18466 // required if we need to consider addressing for types which need
18467 // legalization by splitting.
18468 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18469 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18470 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18471 isPowerOf2_64(VecNumBytes))
18472 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18473
18474 uint64_t VecElemNumBytes =
18475 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18476 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18477 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18478 }
18479
18480 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18481 }
18482
18483 // No scalable offsets allowed for non-scalable types.
18484 if (AM.ScalableOffset)
18485 return false;
18486
18487 // check reg + imm case:
18488 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18489 uint64_t NumBytes = 0;
18490 if (Ty->isSized()) {
18491 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18492 NumBytes = NumBits / 8;
18493 if (!isPowerOf2_64(NumBits))
18494 NumBytes = 0;
18495 }
18496
18497 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18498 AM.Scale);
18499}
18500
18501// Check whether the two offsets belong to the same imm24 range and share the
18502// same high 12 bits; if so, the high part can be folded into the offset of an ADD.
18503int64_t
18505 int64_t MaxOffset) const {
18506 int64_t HighPart = MinOffset & ~0xfffULL;
18507 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18508 // Rebase the value to an integer multiple of imm12.
18509 return HighPart;
18510 }
18511
18512 return 0;
18513}
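// Worked example (editorial, not from the LLVM sources): for MinOffset =
// 0x12340 and MaxOffset = 0x12FF8, both offsets share the high part 0x12000,
// which is a legal shifted add immediate, so 0x12000 is returned. The base can
// then be rebased with "add xTmp, xBase, #0x12, lsl #12" and the individual
// accesses keep small residual offsets 0x340 and 0xFF8.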
18514
18516 // Consider splitting large offset of struct or array.
18517 return true;
18518}
18519
18521 const MachineFunction &MF, EVT VT) const {
18522 EVT ScalarVT = VT.getScalarType();
18523
18524 if (!ScalarVT.isSimple())
18525 return false;
18526
18527 switch (ScalarVT.getSimpleVT().SimpleTy) {
18528 case MVT::f16:
18529 return Subtarget->hasFullFP16();
18530 case MVT::f32:
18531 case MVT::f64:
18532 return true;
18533 case MVT::bf16:
18534 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18535 Subtarget->isNonStreamingSVEorSME2Available();
18536 default:
18537 break;
18538 }
18539
18540 return false;
18541}
18542
18544 Type *Ty) const {
18545 switch (Ty->getScalarType()->getTypeID()) {
18546 case Type::FloatTyID:
18547 case Type::DoubleTyID:
18548 return true;
18549 default:
18550 return false;
18551 }
18552}
18553
18555 EVT VT, CodeGenOptLevel OptLevel) const {
18556 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18558}
18559
18560const MCPhysReg *
18562 // LR is a callee-save register, but we must treat it as clobbered by any call
18563 // site. Hence we include LR in the scratch registers, which are in turn added
18564 // as implicit-defs for stackmaps and patchpoints.
18565 static const MCPhysReg ScratchRegs[] = {
18566 AArch64::X16, AArch64::X17, AArch64::LR, 0
18567 };
18568 return ScratchRegs;
18569}
18570
18572 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18573 return RCRegs;
18574}
18575
18576bool
18578 CombineLevel Level) const {
18579 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18580 N->getOpcode() == ISD::SRL) &&
18581 "Expected shift op");
18582
18583 SDValue ShiftLHS = N->getOperand(0);
18584 EVT VT = N->getValueType(0);
18585
18586 if (!ShiftLHS->hasOneUse())
18587 return false;
18588
18589 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18590 !ShiftLHS.getOperand(0)->hasOneUse())
18591 return false;
18592
18593 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18594 // combine it with shift 'N' to let it be lowered to UBFX except:
18595 // ((x >> C) & mask) << C.
18596 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18597 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18598 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18599 if (isMask_64(TruncMask)) {
18600 SDValue AndLHS = ShiftLHS.getOperand(0);
18601 if (AndLHS.getOpcode() == ISD::SRL) {
18602 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18603 if (N->getOpcode() == ISD::SHL)
18604 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18605 return SRLC->getZExtValue() == SHLC->getZExtValue();
18606 return false;
18607 }
18608 }
18609 }
18610 }
18611 return true;
18612}
18613
18615 const SDNode *N) const {
18616 assert(N->getOpcode() == ISD::XOR &&
18617 (N->getOperand(0).getOpcode() == ISD::SHL ||
18618 N->getOperand(0).getOpcode() == ISD::SRL) &&
18619 "Expected XOR(SHIFT) pattern");
18620
18621 // Only commute if the entire NOT mask is a hidden shifted mask.
18622 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18623 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18624 if (XorC && ShiftC) {
18625 unsigned MaskIdx, MaskLen;
18626 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18627 unsigned ShiftAmt = ShiftC->getZExtValue();
18628 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18629 if (N->getOperand(0).getOpcode() == ISD::SHL)
18630 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18631 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18632 }
18633 }
18634
18635 return false;
18636}
18637
18639 const SDNode *N, CombineLevel Level) const {
18640 assert(((N->getOpcode() == ISD::SHL &&
18641 N->getOperand(0).getOpcode() == ISD::SRL) ||
18642 (N->getOpcode() == ISD::SRL &&
18643 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18644 "Expected shift-shift mask");
18645 // Don't allow multiuse shift folding with the same shift amount.
18646 if (!N->getOperand(0)->hasOneUse())
18647 return false;
18648
18649 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18650 EVT VT = N->getValueType(0);
18651 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18652 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18653 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18654 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18655 }
18656
 18657 // We do not need to fold when this shift is used in the specific load case:
18658 // (ldr x, (add x, (shl (srl x, c1) 2)))
18659 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18660 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18661 unsigned ShlAmt = C2->getZExtValue();
18662 if (auto ShouldADD = *N->user_begin();
18663 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18664 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18665 EVT MemVT = Load->getMemoryVT();
18666
18667 if (Load->getValueType(0).isScalableVector())
18668 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18669
18670 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18671 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18672 }
18673 }
18674 }
18675 }
18676
18677 return true;
18678}
18679
18681 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18682 SDValue Y) const {
18683 return VT.isScalableVector() && isTypeLegal(VT) &&
18684 SelectOpcode == ISD::VSELECT;
18685}
18686
18688 Type *Ty) const {
18689 assert(Ty->isIntegerTy());
18690
18691 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18692 if (BitSize == 0)
18693 return false;
18694
18695 int64_t Val = Imm.getSExtValue();
18696 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18697 return true;
18698
18699 if (Val < 0)
18700 Val = ~Val;
18701 if (BitSize == 32)
18702 Val &= (1LL << 32) - 1;
18703
18704 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18705 // MOVZ is free so return true for one or fewer MOVK.
18706 return Shift < 3;
18707}
18708
18710 unsigned Index) const {
18712 return false;
18713
18714 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18715}
18716
18717/// Turn vector tests of the signbit in the form of:
18718/// xor (sra X, elt_size(X)-1), -1
18719/// into:
18720/// cmge X, X, #0
18722 const AArch64Subtarget *Subtarget) {
18723 EVT VT = N->getValueType(0);
18724 if (!Subtarget->hasNEON() || !VT.isVector())
18725 return SDValue();
18726
18727 // There must be a shift right algebraic before the xor, and the xor must be a
18728 // 'not' operation.
18729 SDValue Shift = N->getOperand(0);
18730 SDValue Ones = N->getOperand(1);
18731 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18733 return SDValue();
18734
18735 // The shift should be smearing the sign bit across each vector element.
18736 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18737 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18738 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18739 return SDValue();
18740
18741 SDLoc DL(N);
18742 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18743 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18744}
18745
18746// Given a vecreduce_add node, detect the below pattern and convert it to the
18747 // node sequence with UABDL, [S|U]ABD and UADDLP.
18748//
18749// i32 vecreduce_add(
18750// v16i32 abs(
18751// v16i32 sub(
18752// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18753//
18754// or
18755//
18756// i32 vecreduce_add(
18757// v16i32 zext(
18758// v16i16 abs(
18759// v16i16 sub(
18760// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18761//
18762// =================>
18763// i32 vecreduce_add(
18764// v4i32 UADDLP(
18765// v8i16 add(
18766// v8i16 zext(
18767// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18768// v8i16 zext(
18769// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
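// As a rough, illustrative sketch of C code that typically produces the first
// form above (names are only for illustration):
//   int sad16(const uint8_t *a, const uint8_t *b) {
//     int s = 0;
//     for (int i = 0; i < 16; ++i)
//       s += abs(a[i] - b[i]);
//     return s;
//   }
// The exact IR depends on how the vectorizer widens the loop.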
18771 SelectionDAG &DAG) {
18772 // Assumed i32 vecreduce_add
18773 if (N->getValueType(0) != MVT::i32)
18774 return SDValue();
18775
18776 SDValue VecReduceOp0 = N->getOperand(0);
18777 bool SawTrailingZext = false;
18778 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18779 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18780 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18781 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18782 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18783 SawTrailingZext = true;
18784 VecReduceOp0 = VecReduceOp0.getOperand(0);
18785 }
18786
18787 // The ABS input is v16i16 if we looked through a zext above, v16i32 otherwise.
18788 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18789 // Assumed v16i16 or v16i32 abs input
18790 unsigned Opcode = VecReduceOp0.getOpcode();
18791 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18792 return SDValue();
18793
18794 SDValue ABS = VecReduceOp0;
18795 // Assumed v16i16 or v16i32 sub
18796 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18797 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18798 return SDValue();
18799
18800 SDValue SUB = ABS->getOperand(0);
18801 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18802 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18803 // Assumed v16i16 or v16i32 type
18804 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18805 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18806 return SDValue();
18807
18808 // Assumed zext or sext
18809 bool IsZExt = false;
18810 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18811 IsZExt = true;
18812 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18813 IsZExt = false;
18814 } else
18815 return SDValue();
18816
18817 SDValue EXT0 = SUB->getOperand(0);
18818 SDValue EXT1 = SUB->getOperand(1);
18819 // Assumed zext's operand has v16i8 type
18820 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18821 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18822 return SDValue();
18823
18824 // Pattern is detected. Let's convert it to sequence of nodes.
18825 SDLoc DL(N);
18826
18827 // First, create the node pattern of UABD/SABD.
18828 SDValue UABDHigh8Op0 =
18829 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18830 DAG.getConstant(8, DL, MVT::i64));
18831 SDValue UABDHigh8Op1 =
18832 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18833 DAG.getConstant(8, DL, MVT::i64));
18834 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18835 UABDHigh8Op0, UABDHigh8Op1);
18836 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18837
18838 // Second, create the node pattern of UABAL.
18839 SDValue UABDLo8Op0 =
18840 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18841 DAG.getConstant(0, DL, MVT::i64));
18842 SDValue UABDLo8Op1 =
18843 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18844 DAG.getConstant(0, DL, MVT::i64));
18845 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18846 UABDLo8Op0, UABDLo8Op1);
18847 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18848 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18849
18850 // Third, create the node of UADDLP.
18851 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18852
18853 // Fourth, create the node of VECREDUCE_ADD.
18854 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18855}
18856
18857static SDValue
18859 const AArch64Subtarget *ST) {
18860 if (DCI.isBeforeLegalize())
18861 return SDValue();
18862
18863 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18864 /*IsEqual=*/false))
18865 return While;
18866
18867 if (!N->getValueType(0).isScalableVector() ||
18868 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18869 return SDValue();
18870
18871 // Count the number of users which are extract_subvector nodes.
18872 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
18873 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
18874 });
18875
18876 auto MaskEC = N->getValueType(0).getVectorElementCount();
18877 if (!MaskEC.isKnownMultipleOf(NumExts))
18878 return SDValue();
18879
18880 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
18881 if (ExtMinEC.getKnownMinValue() < 2)
18882 return SDValue();
18883
18884 SmallVector<SDNode *> Extracts(NumExts, nullptr);
18885 for (SDNode *Use : N->users()) {
18886 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18887 continue;
18888
18889 // Ensure the extract type is correct (e.g. if NumExts is 4 and
18890 // the mask return type is nxv8i1, each extract should be nxv2i1).
18891 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18892 return SDValue();
18893
18894 // There should be exactly one extract for each part of the mask.
18895 unsigned Offset = Use->getConstantOperandVal(1);
18896 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18897 if (Extracts[Part] != nullptr)
18898 return SDValue();
18899
18900 Extracts[Part] = Use;
18901 }
18902
18903 SelectionDAG &DAG = DCI.DAG;
18904 SDLoc DL(N);
18905 SDValue ID =
18906 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18907
18908 SDValue Idx = N->getOperand(0);
18909 SDValue TC = N->getOperand(1);
18910 if (Idx.getValueType() != MVT::i64) {
18911 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18912 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18913 }
18914
18915 // Create the whilelo_x2 intrinsics from each pair of extracts
18916 EVT ExtVT = Extracts[0]->getValueType(0);
18917 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18918 auto R =
18919 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18920 DCI.CombineTo(Extracts[0], R.getValue(0));
18921 DCI.CombineTo(Extracts[1], R.getValue(1));
18922 SmallVector<SDValue> Concats = {DAG.getNode(
18923 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
18924
18925 if (NumExts == 2) {
18926 assert(N->getValueType(0) == DoubleExtVT);
18927 return Concats[0];
18928 }
18929
18930 auto Elts =
18931 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
18932 for (unsigned I = 2; I < NumExts; I += 2) {
18933 // After the first whilelo_x2, we need to increment the starting value.
18934 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
18935 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18936 DCI.CombineTo(Extracts[I], R.getValue(0));
18937 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18938 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
18939 R.getValue(0), R.getValue(1)));
18940 }
18941
18942 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
18943}
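// As an illustrative example: with SVE2p1 (or streaming SME2), an nxv8i1 mask
// whose only users are four nxv2i1 extracts at offsets 0, 2, 4 and 6 is
// rewritten as two whilelo_x2 calls, each yielding a pair of nxv2i1
// predicates; the pairs are concatenated into nxv4i1 halves and then into the
// original nxv8i1 value.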
18944
18945// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
18946// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18947// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18948// If we have vectors larger than v16i8 we extract v16i8 vectors,
18949 // follow the same steps above to get DOT instructions, concatenate them
18950// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
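// As an illustrative example, vecreduce.add(zext(<16 x i8> %a to <16 x i32>))
// becomes vecreduce.add(UDOT(zeroinitializer, %a, splat(i8 1))), and for a
// <32 x i8> input two v16i8 UDOTs are generated and concatenated before the
// final reduction.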
18952 const AArch64Subtarget *ST) {
18953 if (!ST->isNeonAvailable())
18954 return SDValue();
18955
18956 if (!ST->hasDotProd())
18958
18959 SDValue Op0 = N->getOperand(0);
18960 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18961 Op0.getValueType().getVectorElementType() != MVT::i32)
18962 return SDValue();
18963
18964 unsigned ExtOpcode = Op0.getOpcode();
18965 SDValue A = Op0;
18966 SDValue B;
18967 unsigned DotOpcode;
18968 if (ExtOpcode == ISD::MUL) {
18969 A = Op0.getOperand(0);
18970 B = Op0.getOperand(1);
18971 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18972 return SDValue();
18973 auto OpCodeA = A.getOpcode();
18974 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18975 return SDValue();
18976
18977 auto OpCodeB = B.getOpcode();
18978 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18979 return SDValue();
18980
18981 if (OpCodeA == OpCodeB) {
18982 DotOpcode =
18983 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18984 } else {
18985 // Check USDOT support.
18986 if (!ST->hasMatMulInt8())
18987 return SDValue();
18988 DotOpcode = AArch64ISD::USDOT;
18989 if (OpCodeA == ISD::SIGN_EXTEND)
18990 std::swap(A, B);
18991 }
18992 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18993 DotOpcode = AArch64ISD::UDOT;
18994 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18995 DotOpcode = AArch64ISD::SDOT;
18996 } else {
18997 return SDValue();
18998 }
18999
19000 EVT Op0VT = A.getOperand(0).getValueType();
19001 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19002 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19003 if (!IsValidElementCount || !IsValidSize)
19004 return SDValue();
19005
19006 SDLoc DL(Op0);
19007 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19008 // the extend B.
19009 if (!B)
19010 B = DAG.getConstant(1, DL, Op0VT);
19011 else
19012 B = B.getOperand(0);
19013
19014 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19015 unsigned NumOfVecReduce;
19016 EVT TargetType;
19017 if (IsMultipleOf16) {
19018 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19019 TargetType = MVT::v4i32;
19020 } else {
19021 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19022 TargetType = MVT::v2i32;
19023 }
19024 // Handle the case where we need to generate only one Dot operation.
19025 if (NumOfVecReduce == 1) {
19026 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19027 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19028 A.getOperand(0), B);
19029 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19030 }
19031 // Generate Dot instructions for the chunks that are a multiple of 16 elements.
19032 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19033 SmallVector<SDValue, 4> SDotVec16;
19034 unsigned I = 0;
19035 for (; I < VecReduce16Num; I += 1) {
19036 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19037 SDValue Op0 =
19038 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19039 DAG.getConstant(I * 16, DL, MVT::i64));
19040 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19041 DAG.getConstant(I * 16, DL, MVT::i64));
19042 SDValue Dot =
19043 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19044 SDotVec16.push_back(Dot);
19045 }
19046 // Concatenate dot operations.
19047 EVT SDot16EVT =
19048 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19049 SDValue ConcatSDot16 =
19050 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19051 SDValue VecReduceAdd16 =
19052 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19053 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19054 if (VecReduce8Num == 0)
19055 return VecReduceAdd16;
19056
19057 // Generate the remaining Dot operation for the final 8-element chunk.
19058 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19059 SDValue Vec8Op0 =
19060 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19061 DAG.getConstant(I * 16, DL, MVT::i64));
19062 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19063 DAG.getConstant(I * 16, DL, MVT::i64));
19064 SDValue Dot =
19065 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19066 SDValue VecReduceAdd8 =
19067 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19068 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19069 VecReduceAdd8);
19070}
19071
19072// Given an (integer) vecreduce, we know the order of the inputs does not
19073// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19074// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19075// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
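// As an illustrative example, for a v16i8 input x the node
//   add(zext(extract_lo(x) to v8i16), zext(extract_hi(x) to v8i16))
// and UADDLP(x) pair up the 16 bytes differently, but both sets of partial
// sums add up to the same total, so underneath a UADDV reduction the former
// can be replaced by a single UADDLP.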
19077 auto DetectAddExtract = [&](SDValue A) {
19078 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19079 // UADDLP(x) if found.
19080 assert(A.getOpcode() == ISD::ADD);
19081 EVT VT = A.getValueType();
19082 SDValue Op0 = A.getOperand(0);
19083 SDValue Op1 = A.getOperand(1);
19084 if (Op0.getOpcode() != Op1.getOpcode() ||
19085 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19086 Op0.getOpcode() != ISD::SIGN_EXTEND))
19087 return SDValue();
19088 SDValue Ext0 = Op0.getOperand(0);
19089 SDValue Ext1 = Op1.getOperand(0);
19090 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19092 Ext0.getOperand(0) != Ext1.getOperand(0))
19093 return SDValue();
19094 // Check that the type is twice the add types, and the extracts are from
19095 // upper/lower parts of the same source.
19097 VT.getVectorNumElements() * 2)
19098 return SDValue();
19099 if ((Ext0.getConstantOperandVal(1) != 0 ||
19101 (Ext1.getConstantOperandVal(1) != 0 ||
19103 return SDValue();
19104 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19105 : AArch64ISD::SADDLP;
19106 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19107 };
19108
19109 if (SDValue R = DetectAddExtract(A))
19110 return R;
19111
19112 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19113 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19114 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19115 A.getOperand(1));
19116 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19117 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19118 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19119 A.getOperand(0));
19120 return SDValue();
19121}
19122
19123// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19124// UADDLV(concat), where the concat represents the 64-bit zext sources.
19126 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19127 // UADDLV(concat(zext, zext)) if found.
19128 assert(A.getOpcode() == ISD::ADD);
19129 EVT VT = A.getValueType();
19130 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19131 return SDValue();
19132 SDValue Op0 = A.getOperand(0);
19133 SDValue Op1 = A.getOperand(1);
19134 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19135 return SDValue();
19136 SDValue Ext0 = Op0.getOperand(0);
19137 SDValue Ext1 = Op1.getOperand(0);
19138 EVT ExtVT0 = Ext0.getValueType();
19139 EVT ExtVT1 = Ext1.getValueType();
19140 // Check zext VTs are the same and 64-bit length.
19141 if (ExtVT0 != ExtVT1 ||
19142 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19143 return SDValue();
19144 // Get VT for concat of zext sources.
19145 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19146 SDValue Concat =
19147 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19148
19149 switch (VT.getSimpleVT().SimpleTy) {
19150 case MVT::v2i64:
19151 case MVT::v4i32:
19152 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19153 case MVT::v8i16: {
19154 SDValue Uaddlv =
19155 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19156 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19157 }
19158 default:
19159 llvm_unreachable("Unhandled vector type");
19160 }
19161}
19162
19164 SDValue A = N->getOperand(0);
19165 if (A.getOpcode() == ISD::ADD) {
19166 if (SDValue R = performUADDVAddCombine(A, DAG))
19167 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19168 else if (SDValue R = performUADDVZextCombine(A, DAG))
19169 return R;
19170 }
19171
19172 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19173 MVT OpVT = A.getSimpleValueType();
19174 assert(N->getSimpleValueType(0) == OpVT &&
19175 "The operand type should be consistent with the result type of UADDV");
19177 Mask.clearBit(0);
19178 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19179 if (KnownLeadingLanes.isZero())
19180 return A;
19181
19182 return SDValue();
19183}
19184
19187 const AArch64Subtarget *Subtarget) {
19188 if (DCI.isBeforeLegalizeOps())
19189 return SDValue();
19190
19191 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19192}
19193
19194SDValue
19195AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19196 SelectionDAG &DAG,
19197 SmallVectorImpl<SDNode *> &Created) const {
19198 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19199 if (isIntDivCheap(N->getValueType(0), Attr))
19200 return SDValue(N, 0); // Lower SDIV as SDIV
19201
19202 EVT VT = N->getValueType(0);
19203
19204 // If SVE is available, we can generate
19205 // sdiv(x,y) -> ptrue + asrd, where 'y' is a positive pow-2 divisor.
19206 // sdiv(x,y) -> ptrue + asrd + subr, where 'y' is a negative pow-2 divisor.
19207 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19208 return SDValue(N, 0);
19209
19210 // fold (sdiv X, pow2)
19211 if ((VT != MVT::i32 && VT != MVT::i64) ||
19212 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19213 return SDValue();
19214
19215 // If the divisor is 2 or -2, the default expansion is better. It will add
19216 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19217 if (Divisor == 2 ||
19218 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19219 return SDValue();
19220
19221 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19222}
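// As an illustrative example of the SVE path above: sdiv(x, 4) can be selected
// as an asrd by #2 under an all-true predicate, and sdiv(x, -4) additionally
// needs a subr from zero to negate the result.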
19223
19224SDValue
19225AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19226 SelectionDAG &DAG,
19227 SmallVectorImpl<SDNode *> &Created) const {
19228 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19229 if (isIntDivCheap(N->getValueType(0), Attr))
19230 return SDValue(N, 0); // Lower SREM as SREM
19231
19232 EVT VT = N->getValueType(0);
19233
19234 // For scalable and fixed types, mark them as cheap so we can handle it much
19235 // later. This allows us to handle larger than legal types.
19236 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19237 return SDValue(N, 0);
19238
19239 // fold (srem X, pow2)
19240 if ((VT != MVT::i32 && VT != MVT::i64) ||
19241 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19242 return SDValue();
19243
19244 unsigned Lg2 = Divisor.countr_zero();
19245 if (Lg2 == 0)
19246 return SDValue();
19247
19248 SDLoc DL(N);
19249 SDValue N0 = N->getOperand(0);
19250 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19251 SDValue Zero = DAG.getConstant(0, DL, VT);
19252 SDValue CCVal, CSNeg;
19253 if (Lg2 == 1) {
19254 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19255 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19256 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19257
19258 Created.push_back(Cmp.getNode());
19259 Created.push_back(And.getNode());
19260 } else {
19261 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19262 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19263
19264 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19265 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19266 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19267 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19268 Negs.getValue(1));
19269
19270 Created.push_back(Negs.getNode());
19271 Created.push_back(AndPos.getNode());
19272 Created.push_back(AndNeg.getNode());
19273 }
19274
19275 return CSNeg;
19276}
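// As an illustrative example of the Lg2 > 1 path above, srem(x, 8) becomes:
//   negs = subs(0, x)                 ; compute -x and set flags
//   csneg(x & 7, (-x) & 7, mi, negs)  ; x > 0 selects x & 7, else -((-x) & 7)
// e.g. x = -10 yields -(10 & 7) = -2, matching the signed remainder.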
19277
19278static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19279 switch(getIntrinsicID(S.getNode())) {
19280 default:
19281 break;
19282 case Intrinsic::aarch64_sve_cntb:
19283 return 8;
19284 case Intrinsic::aarch64_sve_cnth:
19285 return 16;
19286 case Intrinsic::aarch64_sve_cntw:
19287 return 32;
19288 case Intrinsic::aarch64_sve_cntd:
19289 return 64;
19290 }
19291 return {};
19292}
19293
19294/// Calculates what the pre-extend type is, based on the extension
19295/// operation node provided by \p Extend.
19296///
19297/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19298/// pre-extend type is pulled directly from the operand, while other extend
19299/// operations need a bit more inspection to get this information.
19300///
19301/// \param Extend The SDNode from the DAG that represents the extend operation
19302///
19303/// \returns The type representing the \p Extend source type, or \p MVT::Other
19304/// if no valid type can be determined
19306 switch (Extend.getOpcode()) {
19307 case ISD::SIGN_EXTEND:
19308 case ISD::ZERO_EXTEND:
19309 case ISD::ANY_EXTEND:
19310 return Extend.getOperand(0).getValueType();
19311 case ISD::AssertSext:
19312 case ISD::AssertZext:
19314 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19315 if (!TypeNode)
19316 return MVT::Other;
19317 return TypeNode->getVT();
19318 }
19319 case ISD::AND: {
19322 if (!Constant)
19323 return MVT::Other;
19324
19325 uint32_t Mask = Constant->getZExtValue();
19326
19327 if (Mask == UCHAR_MAX)
19328 return MVT::i8;
19329 else if (Mask == USHRT_MAX)
19330 return MVT::i16;
19331 else if (Mask == UINT_MAX)
19332 return MVT::i32;
19333
19334 return MVT::Other;
19335 }
19336 default:
19337 return MVT::Other;
19338 }
19339}
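// As illustrative examples of the mapping above:
//   and(x, 0xff)        -> pre-extend type i8
//   AssertZext x, i16   -> pre-extend type i16
//   sext(i8 y to i32)   -> pre-extend type i8 (taken directly from the operand)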
19340
19341/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19342/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19343/// SExt/ZExt rather than the scalar SExt/ZExt
19345 EVT VT = BV.getValueType();
19346 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19348 return SDValue();
19349
19350 // Use the first item in the buildvector/shuffle to get the size of the
19351 // extend, and make sure it looks valid.
19352 SDValue Extend = BV->getOperand(0);
19353 unsigned ExtendOpcode = Extend.getOpcode();
19354 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19355 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19356 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19357 ExtendOpcode == ISD::AssertSext;
19358 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19359 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19360 return SDValue();
19361 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19362 // ensure calculatePreExtendType will work without issue.
19363 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19364 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19365 return SDValue();
19366
19367 // Restrict valid pre-extend data type
19368 EVT PreExtendType = calculatePreExtendType(Extend);
19369 if (PreExtendType == MVT::Other ||
19370 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19371 return SDValue();
19372
19373 // Make sure all other operands are equally extended.
19374 bool SeenZExtOrSExt = !IsAnyExt;
19375 for (SDValue Op : drop_begin(BV->ops())) {
19376 if (Op.isUndef())
19377 continue;
19378
19379 if (calculatePreExtendType(Op) != PreExtendType)
19380 return SDValue();
19381
19382 unsigned Opc = Op.getOpcode();
19383 if (Opc == ISD::ANY_EXTEND)
19384 continue;
19385
19386 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19388
19389 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19390 return SDValue();
19391
19392 IsSExt = OpcIsSExt;
19393 SeenZExtOrSExt = true;
19394 }
19395
19396 SDValue NBV;
19397 SDLoc DL(BV);
19398 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19399 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19400 EVT PreExtendLegalType =
19401 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19403 for (SDValue Op : BV->ops())
19404 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19405 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19406 PreExtendLegalType));
19407 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19408 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19409 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19410 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19411 BV.getOperand(1).isUndef()
19412 ? DAG.getUNDEF(PreExtendVT)
19413 : BV.getOperand(1).getOperand(0),
19414 cast<ShuffleVectorSDNode>(BV)->getMask());
19415 }
19416 unsigned ExtOpc = !SeenZExtOrSExt
19418 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19419 return DAG.getNode(ExtOpc, DL, VT, NBV);
19420}
19421
19422/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19423/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19425 // If the value type isn't a vector, none of the operands are going to be dups
19426 EVT VT = Mul->getValueType(0);
19427 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19428 return SDValue();
19429
19430 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19431 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19432
19433 // Neither operand has been changed, so don't make any further changes.
19434 if (!Op0 && !Op1)
19435 return SDValue();
19436
19437 SDLoc DL(Mul);
19438 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19439 Op1 ? Op1 : Mul->getOperand(1));
19440}
19441
19442// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19443// Same for other types with equivalent constants.
19445 EVT VT = N->getValueType(0);
19446 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19447 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19448 return SDValue();
19449 if (N->getOperand(0).getOpcode() != ISD::AND ||
19450 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19451 return SDValue();
19452
19453 SDValue And = N->getOperand(0);
19454 SDValue Srl = And.getOperand(0);
19455
19456 APInt V1, V2, V3;
19457 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19458 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19460 return SDValue();
19461
19462 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19463 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19464 V3 != (HalfSize - 1))
19465 return SDValue();
19466
19467 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19468 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19469 VT.getVectorElementCount() * 2);
19470
19471 SDLoc DL(N);
19472 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19473 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19474 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19475 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19476}
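// As an illustrative sketch of why the transform above is sound: each i32 lane
// of the srl input holds two i16 values; srl by 15 moves the low half's sign
// bit to bit 0 and the high half's sign bit to bit 16, the AND with 0x10001
// isolates those two bits, and the multiply by 0xffff smears each bit across
// its 16-bit half. The result is 0xFFFF exactly in the i16 lanes that were
// negative, which is what a single v8i16 CMLTz (compare less than zero)
// produces.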
19477
19478// Transform vector add(zext i8 to i32, zext i8 to i32)
19479// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19480 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19481// extends.
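// As an illustrative example (using sext for the outer extend, as above):
//   add(zext <16 x i8> %a to <16 x i32>, zext <16 x i8> %b to <16 x i32>)
// becomes
//   sext(add(zext %a to <16 x i16>, zext %b to <16 x i16>)) to <16 x i32>
// The 9-bit sums fit comfortably in i16, so the outer sign extension preserves
// the value, and the narrower add can be matched as uaddl/uaddl2.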
19483 EVT VT = N->getValueType(0);
19484 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19485 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19486 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19487 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19488 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19489 N->getOperand(0).getOperand(0).getValueType() !=
19490 N->getOperand(1).getOperand(0).getValueType())
19491 return SDValue();
19492
19493 if (N->getOpcode() == ISD::MUL &&
19494 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19495 return SDValue();
19496
19497 SDValue N0 = N->getOperand(0).getOperand(0);
19498 SDValue N1 = N->getOperand(1).getOperand(0);
19499 EVT InVT = N0.getValueType();
19500
19501 EVT S1 = InVT.getScalarType();
19502 EVT S2 = VT.getScalarType();
19503 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19504 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19505 SDLoc DL(N);
19506 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19509 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19510 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19511 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19512 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19513 : (unsigned)ISD::SIGN_EXTEND,
19514 DL, VT, NewOp);
19515 }
19516 return SDValue();
19517}
19518
19521 const AArch64Subtarget *Subtarget) {
19522
19523 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19524 return Ext;
19526 return Ext;
19527 if (SDValue Ext = performVectorExtCombine(N, DAG))
19528 return Ext;
19529
19530 if (DCI.isBeforeLegalizeOps())
19531 return SDValue();
19532
19533 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19534 // and in MachineCombiner pass, add+mul will be combined into madd.
19535 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19536 SDLoc DL(N);
19537 EVT VT = N->getValueType(0);
19538 SDValue N0 = N->getOperand(0);
19539 SDValue N1 = N->getOperand(1);
19540 SDValue MulOper;
19541 unsigned AddSubOpc;
19542
19543 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19544 AddSubOpc = V->getOpcode();
19545 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19546 SDValue Opnd = V->getOperand(1);
19547 MulOper = V->getOperand(0);
19548 if (AddSubOpc == ISD::SUB)
19549 std::swap(Opnd, MulOper);
19550 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19551 return C->isOne();
19552 }
19553 return false;
19554 };
19555
19556 if (IsAddSubWith1(N0)) {
19557 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19558 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19559 }
19560
19561 if (IsAddSubWith1(N1)) {
19562 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19563 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19564 }
19565
19566 // The below optimizations require a constant RHS.
19567 if (!isa<ConstantSDNode>(N1))
19568 return SDValue();
19569
19571 const APInt &ConstValue = C->getAPIntValue();
19572
19573 // Allow the scaling to be folded into the `cnt` instruction by preventing
19574 // the scaling from being obscured here. This makes it easier to pattern match.
19575 if (IsSVECntIntrinsic(N0) ||
19576 (N0->getOpcode() == ISD::TRUNCATE &&
19577 (IsSVECntIntrinsic(N0->getOperand(0)))))
19578 if (ConstValue.sge(1) && ConstValue.sle(16))
19579 return SDValue();
19580
19581 // Multiplication of a power of two plus/minus one can be done more
19582 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19583 // future CPUs have a cheaper MADD instruction, this may need to be
19584 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19585 // 64-bit is 5 cycles, so this is always a win.
19586 // More aggressively, some multiplications N0 * C can be lowered to
19587 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19588 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19589 // TODO: lower more cases.
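// As an illustrative example, x * 45 with 45 = (1+4)*(1+8) can become
//   t = (x << 2) + x   ; t = 5*x
//   r = (t << 3) + t   ; r = 40*x + 5*x = 45*x
// i.e. two shift+add steps rather than materializing 45 and using MADD/MUL.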
19590
19591 // TrailingZeroes is used to test if the mul can be lowered to
19592 // shift+add+shift.
19593 unsigned TrailingZeroes = ConstValue.countr_zero();
19594 if (TrailingZeroes) {
19595 // Conservatively do not lower to shift+add+shift if the mul might be
19596 // folded into smul or umul.
19597 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19598 isZeroExtended(N0, DAG)))
19599 return SDValue();
19600 // Conservatively do not lower to shift+add+shift if the mul might be
19601 // folded into madd or msub.
19602 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19603 N->user_begin()->getOpcode() == ISD::SUB))
19604 return SDValue();
19605 }
19606 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19607 // and shift+add+shift.
19608 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19609 unsigned ShiftAmt;
19610
19611 auto Shl = [&](SDValue N0, unsigned N1) {
19612 if (!N0.getNode())
19613 return SDValue();
19614 // If shift causes overflow, ignore this combine.
19615 if (N1 >= N0.getValueSizeInBits())
19616 return SDValue();
19617 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19618 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19619 };
19620 auto Add = [&](SDValue N0, SDValue N1) {
19621 if (!N0.getNode() || !N1.getNode())
19622 return SDValue();
19623 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19624 };
19625 auto Sub = [&](SDValue N0, SDValue N1) {
19626 if (!N0.getNode() || !N1.getNode())
19627 return SDValue();
19628 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19629 };
19630 auto Negate = [&](SDValue N) {
19631 if (!N0.getNode())
19632 return SDValue();
19633 SDValue Zero = DAG.getConstant(0, DL, VT);
19634 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19635 };
19636
19637 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19638 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19639 // the (2^N - 1) can't be executed via a single instruction.
19640 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19641 unsigned BitWidth = C.getBitWidth();
19642 for (unsigned i = 1; i < BitWidth / 2; i++) {
19643 APInt Rem;
19644 APInt X(BitWidth, (1 << i) + 1);
19645 APInt::sdivrem(C, X, N, Rem);
19646 APInt NVMinus1 = N - 1;
19647 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19648 M = X;
19649 return true;
19650 }
19651 }
19652 return false;
19653 };
19654
19655 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19656 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19657 // the (2^N - 1) can't be executed via a single instruction.
19658 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19659 APInt CVMinus1 = C - 1;
19660 if (CVMinus1.isNegative())
19661 return false;
19662 unsigned TrailingZeroes = CVMinus1.countr_zero();
19663 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19664 if (SCVMinus1.isPowerOf2()) {
19665 unsigned BitWidth = SCVMinus1.getBitWidth();
19666 M = APInt(BitWidth, SCVMinus1.logBase2());
19667 N = APInt(BitWidth, TrailingZeroes);
19668 return true;
19669 }
19670 return false;
19671 };
19672
19673 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19674 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19675 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19676 APInt CVMinus1 = C - 1;
19677 if (CVMinus1.isNegative())
19678 return false;
19679 unsigned TrailingZeroes = CVMinus1.countr_zero();
19680 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19681 if (CVPlus1.isPowerOf2()) {
19682 unsigned BitWidth = CVPlus1.getBitWidth();
19683 M = APInt(BitWidth, CVPlus1.logBase2());
19684 N = APInt(BitWidth, TrailingZeroes);
19685 return true;
19686 }
19687 return false;
19688 };
19689
19690 if (ConstValue.isNonNegative()) {
19691 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19692 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19693 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19694 // (mul x, (2^M + 1) * (2^N + 1))
19695 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19696 // (mul x, (2^M + 1) * 2^N + 1)
19697 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
19698 // (mul x, 1 - (1 - 2^M) * 2^N)
19699 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
19700 APInt SCVMinus1 = ShiftedConstValue - 1;
19701 APInt SCVPlus1 = ShiftedConstValue + 1;
19702 APInt CVPlus1 = ConstValue + 1;
19703 APInt CVM, CVN;
19704 if (SCVMinus1.isPowerOf2()) {
19705 ShiftAmt = SCVMinus1.logBase2();
19706 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19707 } else if (CVPlus1.isPowerOf2()) {
19708 ShiftAmt = CVPlus1.logBase2();
19709 return Sub(Shl(N0, ShiftAmt), N0);
19710 } else if (SCVPlus1.isPowerOf2()) {
19711 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19712 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19713 }
19714 if (Subtarget->hasALULSLFast() &&
19715 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19716 APInt CVMMinus1 = CVM - 1;
19717 APInt CVNMinus1 = CVN - 1;
19718 unsigned ShiftM1 = CVMMinus1.logBase2();
19719 unsigned ShiftN1 = CVNMinus1.logBase2();
19720 // ALULSLFast implies that shifts of up to 4 places are fast
19721 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19722 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19723 return Add(Shl(MVal, ShiftN1), MVal);
19724 }
19725 }
19726 if (Subtarget->hasALULSLFast() &&
19727 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19728 unsigned ShiftM = CVM.getZExtValue();
19729 unsigned ShiftN = CVN.getZExtValue();
19730 // ALULSLFast implies that shifts of up to 4 places are fast
19731 if (ShiftM <= 4 && ShiftN <= 4) {
19732 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19733 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19734 }
19735 }
19736
19737 if (Subtarget->hasALULSLFast() &&
19738 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19739 unsigned ShiftM = CVM.getZExtValue();
19740 unsigned ShiftN = CVN.getZExtValue();
19741 // ALULSLFast implies that shifts of up to 4 places are fast
19742 if (ShiftM <= 4 && ShiftN <= 4) {
19743 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19744 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19745 }
19746 }
19747 } else {
19748 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19749 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19750 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19751 APInt SCVPlus1 = -ShiftedConstValue + 1;
19752 APInt CVNegPlus1 = -ConstValue + 1;
19753 APInt CVNegMinus1 = -ConstValue - 1;
19754 if (CVNegPlus1.isPowerOf2()) {
19755 ShiftAmt = CVNegPlus1.logBase2();
19756 return Sub(N0, Shl(N0, ShiftAmt));
19757 } else if (CVNegMinus1.isPowerOf2()) {
19758 ShiftAmt = CVNegMinus1.logBase2();
19759 return Negate(Add(Shl(N0, ShiftAmt), N0));
19760 } else if (SCVPlus1.isPowerOf2()) {
19761 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19762 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19763 }
19764 }
19765
19766 return SDValue();
19767}
19768
19770 SelectionDAG &DAG) {
19771 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19772 // optimize away operation when it's from a constant.
19773 //
19774 // The general transformation is:
19775 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19776 // AND(VECTOR_CMP(x,y), constant2)
19777 // constant2 = UNARYOP(constant)
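// As an illustrative example, sint_to_fp(and(setcc(x, y), splat(i32 1)))
// becomes and(setcc(x, y), bitcast(splat(float 1.0))) (with bitcasts around
// the AND): each compare lane is all-ones or zero, so the AND directly yields
// 1.0f or 0.0f without a per-lane int-to-fp conversion.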
19778
19779 // Early exit if this isn't a vector operation, the operand of the
19780 // unary operation isn't a bitwise AND, or if the sizes of the operations
19781 // aren't the same.
19782 EVT VT = N->getValueType(0);
19783 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19784 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19785 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19786 return SDValue();
19787
19788 // Now check that the other operand of the AND is a constant. We could
19789 // make the transformation for non-constant splats as well, but it's unclear
19790 // that would be a benefit as it would not eliminate any operations, just
19791 // perform one more step in scalar code before moving to the vector unit.
19792 if (BuildVectorSDNode *BV =
19793 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19794 // Bail out if the vector isn't a constant.
19795 if (!BV->isConstant())
19796 return SDValue();
19797
19798 // Everything checks out. Build up the new and improved node.
19799 SDLoc DL(N);
19800 EVT IntVT = BV->getValueType(0);
19801 // Create a new constant of the appropriate type for the transformed
19802 // DAG.
19803 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19804 // The AND node needs bitcasts to/from an integer vector type around it.
19805 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19806 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19807 N->getOperand(0)->getOperand(0), MaskConst);
19808 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19809 return Res;
19810 }
19811
19812 return SDValue();
19813}
19814
19815/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19816 /// functions; this can help to reduce the number of fmovs to/from GPRs.
19817static SDValue
19820 const AArch64Subtarget *Subtarget) {
19821 if (N->isStrictFPOpcode())
19822 return SDValue();
19823
19824 if (DCI.isBeforeLegalizeOps())
19825 return SDValue();
19826
19827 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19828 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19829 return SDValue();
19830
19831 auto isSupportedType = [](EVT VT) {
19832 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19833 };
19834
19835 SDValue SrcVal = N->getOperand(0);
19836 EVT SrcTy = SrcVal.getValueType();
19837 EVT DestTy = N->getValueType(0);
19838
19839 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19840 return SDValue();
19841
19842 EVT SrcVecTy;
19843 EVT DestVecTy;
19844 if (DestTy.bitsGT(SrcTy)) {
19845 DestVecTy = getPackedSVEVectorVT(DestTy);
19846 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19847 } else {
19848 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19849 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19850 }
19851
19852 // Ensure the resulting src/dest vector type is legal.
19853 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19854 return SDValue();
19855
19856 SDLoc DL(N);
19857 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19858 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19859 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19860 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19861 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19862}
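// As an illustrative sketch: a scalar fp_to_sint f32 -> i32 is rebuilt as
//   insert_vector_elt(undef nxv4f32, x, 0) -> fp_to_sint nxv4i32
//   -> extract_vector_elt lane 0
// keeping the conversion on the SVE side, which, as noted above, can reduce
// the number of fmovs to/from GPRs in streaming functions.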
19863
19866 const AArch64Subtarget *Subtarget) {
19867 // First try to optimize away the conversion when it's conditionally from
19868 // a constant. Vectors only.
19870 return Res;
19871
19872 if (SDValue Res =
19873 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19874 return Res;
19875
19876 EVT VT = N->getValueType(0);
19877 if (VT != MVT::f32 && VT != MVT::f64)
19878 return SDValue();
19879
19880 // Only optimize when the source and destination types have the same width.
19881 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19882 return SDValue();
19883
19884 // If the result of an integer load is only used by an integer-to-float
19885 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19886 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19887 SDValue N0 = N->getOperand(0);
19888 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19889 N0.hasOneUse() &&
19890 // Do not change the width of a volatile load.
19891 !cast<LoadSDNode>(N0)->isVolatile()) {
19892 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19893 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19894 LN0->getPointerInfo(), LN0->getAlign(),
19895 LN0->getMemOperand()->getFlags());
19896
19897 // Make sure successors of the original load stay after it by updating them
19898 // to use the new Chain.
19899 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19900
19901 unsigned Opcode =
19902 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19903 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19904 }
19905
19906 return SDValue();
19907}
19908
19909/// Fold a floating-point multiply by power of two into floating-point to
19910/// fixed-point conversion.
19913 const AArch64Subtarget *Subtarget) {
19914 if (SDValue Res =
19915 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19916 return Res;
19917
19918 if (!Subtarget->isNeonAvailable())
19919 return SDValue();
19920
19921 if (!N->getValueType(0).isSimple())
19922 return SDValue();
19923
19924 SDValue Op = N->getOperand(0);
19925 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19926 return SDValue();
19927
19928 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19929 return SDValue();
19930
19931 SDValue ConstVec = Op->getOperand(1);
19932 if (!isa<BuildVectorSDNode>(ConstVec))
19933 return SDValue();
19934
19935 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19936 uint32_t FloatBits = FloatTy.getSizeInBits();
19937 if (FloatBits != 32 && FloatBits != 64 &&
19938 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19939 return SDValue();
19940
19941 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19942 uint32_t IntBits = IntTy.getSizeInBits();
19943 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19944 return SDValue();
19945
19946 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19947 if (IntBits > FloatBits)
19948 return SDValue();
19949
19950 BitVector UndefElements;
19952 int32_t Bits = IntBits == 64 ? 64 : 32;
19953 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19954 if (C == -1 || C == 0 || C > Bits)
19955 return SDValue();
19956
19957 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19958 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19959 return SDValue();
19960
19961 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19962 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19963 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19964 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19965 return SDValue();
19966 }
19967
19968 SDLoc DL(N);
19969 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19970 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19971 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19972 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19973 SDValue FixConv =
19975 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19976 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19977 // We can handle smaller integers by generating an extra trunc.
19978 if (IntBits < FloatBits)
19979 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19980
19981 return FixConv;
19982}
19983
19984// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19985// convert to csel(ccmp(.., cc0)), depending on cc1:
19986
19987// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19988// =>
19989// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19990//
19991// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19992// =>
19993// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19995 EVT VT = N->getValueType(0);
19996 SDValue CSel0 = N->getOperand(0);
19997 SDValue CSel1 = N->getOperand(1);
19998
19999 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20000 CSel1.getOpcode() != AArch64ISD::CSEL)
20001 return SDValue();
20002
20003 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20004 return SDValue();
20005
20006 if (!isNullConstant(CSel0.getOperand(0)) ||
20007 !isOneConstant(CSel0.getOperand(1)) ||
20008 !isNullConstant(CSel1.getOperand(0)) ||
20009 !isOneConstant(CSel1.getOperand(1)))
20010 return SDValue();
20011
20012 SDValue Cmp0 = CSel0.getOperand(3);
20013 SDValue Cmp1 = CSel1.getOperand(3);
20016 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20017 return SDValue();
20018 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20019 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20020 std::swap(Cmp0, Cmp1);
20021 std::swap(CC0, CC1);
20022 }
20023
20024 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20025 return SDValue();
20026
20027 SDLoc DL(N);
20028 SDValue CCmp, Condition;
20029 unsigned NZCV;
20030
20031 if (N->getOpcode() == ISD::AND) {
20033 Condition = getCondCode(DAG, InvCC0);
20035 } else {
20037 Condition = getCondCode(DAG, CC0);
20039 }
20040
20041 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20042
20043 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20044 if (Op1 && Op1->getAPIntValue().isNegative() &&
20045 Op1->getAPIntValue().sgt(-32)) {
20046 // CCMP accepts constants in the range [0, 31].
20047 // If Op1 is a constant in the range [-31, -1], we
20048 // can select CCMN instead to avoid the extra mov.
20049 SDValue AbsOp1 =
20050 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20051 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20052 AbsOp1, NZCVOp, Condition, Cmp0);
20053 } else {
20054 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20055 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20056 }
20057 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20058 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20059}
20060
20062 const AArch64Subtarget *Subtarget,
20063 const AArch64TargetLowering &TLI) {
20064 SelectionDAG &DAG = DCI.DAG;
20065
20066 if (SDValue R = performANDORCSELCombine(N, DAG))
20067 return R;
20068
20069 return SDValue();
20070}
20071
20073 if (!MemVT.getVectorElementType().isSimple())
20074 return false;
20075
20076 uint64_t MaskForTy = 0ull;
20077 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20078 case MVT::i8:
20079 MaskForTy = 0xffull;
20080 break;
20081 case MVT::i16:
20082 MaskForTy = 0xffffull;
20083 break;
20084 case MVT::i32:
20085 MaskForTy = 0xffffffffull;
20086 break;
20087 default:
20088 return false;
20089 break;
20090 }
20091
20092 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20093 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20094 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20095
20096 return false;
20097}
20098
20100 SDValue LeafOp = SDValue(N, 0);
20101 SDValue Op = N->getOperand(0);
20102 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20103 LeafOp.getValueType() != Op.getValueType())
20104 Op = Op->getOperand(0);
20105 if (LeafOp.getValueType() == Op.getValueType())
20106 return Op;
20107 return SDValue();
20108}
20109
20112 SelectionDAG &DAG = DCI.DAG;
20113 SDValue Src = N->getOperand(0);
20114 unsigned Opc = Src->getOpcode();
20115
20116 // Zero/any extend of an unsigned unpack
20117 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20118 SDValue UnpkOp = Src->getOperand(0);
20119 SDValue Dup = N->getOperand(1);
20120
20121 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20122 return SDValue();
20123
20124 SDLoc DL(N);
20126 if (!C)
20127 return SDValue();
20128
20129 uint64_t ExtVal = C->getZExtValue();
20130
20131 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20132 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20133 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20134 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20135 };
20136
20137 // If the mask is fully covered by the unpack, we don't need to push
20138 // a new AND onto the operand
20139 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20140 if (MaskAndTypeMatch(EltTy))
20141 return Src;
20142
20143 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20144 // to see if the mask is all-ones of size MemTy.
20145 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20146 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20147 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20148 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20149 if (MaskAndTypeMatch(EltTy))
20150 return Src;
20151 }
20152
20153 // Truncate to prevent a DUP with an overly wide constant
20154 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20155
20156 // Otherwise, make sure we propagate the AND to the operand
20157 // of the unpack
20158 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20159 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20160
20161 SDValue And = DAG.getNode(ISD::AND, DL,
20162 UnpkOp->getValueType(0), UnpkOp, Dup);
20163
20164 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20165 }
20166
20167 if (DCI.isBeforeLegalizeOps())
20168 return SDValue();
20169
20170 // If either side of the AND is an all-active predicate, the AND is a no-op
20171 // and we can simply return the other operand.
20172 if (isAllActivePredicate(DAG, N->getOperand(0)))
20173 return N->getOperand(1);
20174 if (isAllActivePredicate(DAG, N->getOperand(1)))
20175 return N->getOperand(0);
20176
20178 return SDValue();
20179
20180 SDValue Mask = N->getOperand(1);
20181
20182 if (!Src.hasOneUse())
20183 return SDValue();
20184
20185 EVT MemVT;
20186
20187 // SVE load instructions perform an implicit zero-extend, which makes them
20188 // perfect candidates for combining.
20189 switch (Opc) {
20190 case AArch64ISD::LD1_MERGE_ZERO:
20191 case AArch64ISD::LDNF1_MERGE_ZERO:
20192 case AArch64ISD::LDFF1_MERGE_ZERO:
20193 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20194 break;
20195 case AArch64ISD::GLD1_MERGE_ZERO:
20196 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20197 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20198 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20199 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20200 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20201 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20202 case AArch64ISD::GLDFF1_MERGE_ZERO:
20203 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20204 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20205 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20206 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20207 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20208 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20209 case AArch64ISD::GLDNT1_MERGE_ZERO:
20210 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20211 break;
20212 default:
20213 return SDValue();
20214 }
20215
20216 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20217 return Src;
20218
20219 return SDValue();
20220}
20221
20222// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20225
20226 // This function performs an optimization on a specific pattern involving
20227 // an AND operation and SETCC (Set Condition Code) node.
20228
20229 SDValue SetCC = N->getOperand(0);
20230 EVT VT = N->getValueType(0);
20231 SelectionDAG &DAG = DCI.DAG;
20232
20233 // If the current node (N) is used by any SELECT instruction, return an empty
20234 // SDValue and skip the optimization, since applying it in that case could
20235 // produce incorrect results.
20236 for (auto U : N->users())
20237 if (U->getOpcode() == ISD::SELECT)
20238 return SDValue();
20239
20240 // Check if the operand is a SETCC node with floating-point comparison
20241 if (SetCC.getOpcode() == ISD::SETCC &&
20242 SetCC.getOperand(0).getValueType() == MVT::f32) {
20243
20244 SDValue Cmp;
20246
20247 // Check if the DAG is after legalization and if we can emit the conjunction
20248 if (!DCI.isBeforeLegalize() &&
20249 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20250
20252
20253 SDLoc DL(N);
20254 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20255 DAG.getConstant(0, DL, VT),
20256 getCondCode(DAG, InvertedCC), Cmp);
20257 }
20258 }
20259 return SDValue();
20260}
20261
20264 SelectionDAG &DAG = DCI.DAG;
20265 SDValue LHS = N->getOperand(0);
20266 SDValue RHS = N->getOperand(1);
20267 EVT VT = N->getValueType(0);
20268
20269 if (SDValue R = performANDORCSELCombine(N, DAG))
20270 return R;
20271
20272 if (SDValue R = performANDSETCCCombine(N,DCI))
20273 return R;
20274
20275 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20276 return SDValue();
20277
20278 if (VT.isScalableVector())
20279 return performSVEAndCombine(N, DCI);
20280
20281 // The combining code below works only for NEON vectors. In particular, it
20282 // does not work for SVE when dealing with vectors wider than 128 bits.
20283 if (!VT.is64BitVector() && !VT.is128BitVector())
20284 return SDValue();
20285
20286 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
20287 if (!BVN)
20288 return SDValue();
20289
20290 // AND does not accept an immediate, so check if we can use a BIC immediate
20291 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20292 // pattern in isel, because some immediates may be lowered to the preferred
20293 // (and x, (movi imm)) form, even though an mvni representation also exists.
20294 APInt DefBits(VT.getSizeInBits(), 0);
20295 APInt UndefBits(VT.getSizeInBits(), 0);
20296 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20297 SDValue NewOp;
20298
20299 // Any bits known to already be 0 need not be cleared again, which can help
20300 // reduce the size of the immediate to one supported by the instruction.
20301 KnownBits Known = DAG.computeKnownBits(LHS);
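 // computeKnownBits reports known bits per element; replicate them across the
 // full vector width before combining with the build_vector bits below.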
20302 APInt ZeroSplat(VT.getSizeInBits(), 0);
20303 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20304 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20305 << (Known.Zero.getBitWidth() * I);
20306
20307 DefBits = ~(DefBits | ZeroSplat);
20308 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20309 DefBits, &LHS)) ||
20310 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20311 DefBits, &LHS)))
20312 return NewOp;
20313
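 // Also retry using the bits contributed by undef elements of the
 // build_vector, which may expose a BIC-encodable immediate.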
20314 UndefBits = ~(UndefBits | ZeroSplat);
20315 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20316 UndefBits, &LHS)) ||
20317 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20318 UndefBits, &LHS)))
20319 return NewOp;
20320 }
20321
20322 return SDValue();
20323}
20324
20325static SDValue performFADDCombine(SDNode *N,
20326 TargetLowering::DAGCombinerInfo &DCI) {
20327 SelectionDAG &DAG = DCI.DAG;
20328 SDValue LHS = N->getOperand(0);
20329 SDValue RHS = N->getOperand(1);
20330 EVT VT = N->getValueType(0);
20331 SDLoc DL(N);
20332
20333 if (!N->getFlags().hasAllowReassociation())
20334 return SDValue();
20335
20336 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20337 auto ReassocComplex = [&](SDValue A, SDValue B) {
20338 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20339 return SDValue();
20340 unsigned Opc = A.getConstantOperandVal(0);
20341 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20342 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20343 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20344 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20345 return SDValue();
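 // Fold B into the accumulator operand of the vcmla intrinsic (operand 1);
 // the two multiplicands (operands 2 and 3) are unchanged.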
20346 SDValue VCMLA = DAG.getNode(
20347 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20348 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20349 A.getOperand(2), A.getOperand(3));
20350 VCMLA->setFlags(A->getFlags());
20351 return VCMLA;
20352 };
20353 if (SDValue R = ReassocComplex(LHS, RHS))
20354 return R;
20355 if (SDValue R = ReassocComplex(RHS, LHS))
20356 return R;
20357
20358 return SDValue();
20359}
20360
20361static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20362 switch (Opcode) {
20363 case ISD::STRICT_FADD:
20364 case ISD::FADD:
20365 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20366 case ISD::ADD:
20367 return VT == MVT::i64;
20368 default:
20369 return false;
20370 }
20371}
20372
20373static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20374 AArch64CC::CondCode Cond);
20375
20376static bool isPredicateCCSettingOp(SDValue N) {
20377 if ((N.getOpcode() == ISD::SETCC) ||
20378 // get_active_lane_mask is lowered to a whilelo instruction.
20379 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20380 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20381 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20382 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20383 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20384 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20385 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20386 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20387 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20388 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20389 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20390 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20391 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20392 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20393 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20394 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20395 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20396 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20397 return true;
20398
20399 return false;
20400}
20401
20402// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20403// ... into: "ptrue p, all" + PTEST
20404static SDValue
20405performFirstTrueTestVectorCombine(SDNode *N,
20406 TargetLowering::DAGCombinerInfo &DCI,
20407 const AArch64Subtarget *Subtarget) {
20408 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20409 // Make sure PTEST can be legalised with illegal types.
20410 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20411 return SDValue();
20412
20413 SDValue N0 = N->getOperand(0);
20414 EVT VT = N0.getValueType();
20415
20416 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20417 !isNullConstant(N->getOperand(1)))
20418 return SDValue();
20419
20420 // Restricted the DAG combine to only cases where we're extracting from a
20421 // flag-setting operation.
20422 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20423 return SDValue();
20424
20425 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20426 SelectionDAG &DAG = DCI.DAG;
20427 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20428 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20429}
20430
20431// Materialize : Idx = (add (mul vscale, NumEls), -1)
20432// i1 = extract_vector_elt t37, Constant:i64<Idx>
20433// ... into: "ptrue p, all" + PTEST
20434static SDValue
20435performLastTrueTestVectorCombine(SDNode *N,
20436 TargetLowering::DAGCombinerInfo &DCI,
20437 const AArch64Subtarget *Subtarget) {
20438 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20439 // Make sure PTEST can be legalised with illegal types.
20440 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20441 return SDValue();
20442
20443 SDValue N0 = N->getOperand(0);
20444 EVT OpVT = N0.getValueType();
20445
20446 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20447 return SDValue();
20448
20449 // Idx == (add (mul vscale, NumEls), -1)
20450 SDValue Idx = N->getOperand(1);
20451 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20452 return SDValue();
20453
20454 SDValue VS = Idx.getOperand(0);
20455 if (VS.getOpcode() != ISD::VSCALE)
20456 return SDValue();
20457
20458 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20459 if (VS.getConstantOperandVal(0) != NumEls)
20460 return SDValue();
20461
20462 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20463 SelectionDAG &DAG = DCI.DAG;
20464 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20465 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20466}
20467
20468static SDValue
20469performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20470 const AArch64Subtarget *Subtarget) {
20471 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20472 SelectionDAG &DAG = DCI.DAG;
20473 SDValue Vec = N->getOperand(0);
20474 SDValue Idx = N->getOperand(1);
20475
20476 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20477 return SDValue();
20478
20479 // Only legal for 8, 16, 32, and 64 bit element types.
20480 EVT EltVT = Vec.getValueType().getVectorElementType();
20481 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20482 MVT::bf16, MVT::f32, MVT::f64}),
20483 EltVT.getSimpleVT().SimpleTy))
20484 return SDValue();
20485
20486 SDValue Mask = Idx.getOperand(0);
20487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20488 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20489 return SDValue();
20490
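 // extract(vec, vector_find_last_active(mask)) maps directly onto the SVE
 // LASTB instruction.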
20491 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20492 Vec);
20493}
20494
20495static SDValue
20496performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20497 const AArch64Subtarget *Subtarget) {
20498 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20499 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20500 return Res;
20501 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20502 return Res;
20503 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20504 return Res;
20505
20506 SelectionDAG &DAG = DCI.DAG;
20507 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20508
20509 EVT VT = N->getValueType(0);
20510 const bool FullFP16 = Subtarget->hasFullFP16();
20511 bool IsStrict = N0->isStrictFPOpcode();
20512
20513 // extract(dup x) -> x
20514 if (N0.getOpcode() == AArch64ISD::DUP)
20515 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20516 : N0.getOperand(0);
20517
20518 // Rewrite for pairwise fadd pattern
20519 // (f32 (extract_vector_elt
20520 // (fadd (vXf32 Other)
20521 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20522 // ->
20523 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20524 // (extract_vector_elt (vXf32 Other) 1))
20525 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20526 // we can only do this when it's used only by the extract_vector_elt.
20527 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20528 (!IsStrict || N0.hasOneUse())) {
20529 SDLoc DL(N0);
20530 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20531 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20532
20533 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20534 SDValue Other = N00;
20535
20536 // And handle the commutative case.
20537 if (!Shuffle) {
20538 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20539 Other = N01;
20540 }
20541
20542 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20543 Other == Shuffle->getOperand(0)) {
20544 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20545 DAG.getConstant(0, DL, MVT::i64));
20546 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20547 DAG.getConstant(1, DL, MVT::i64));
20548 if (!IsStrict)
20549 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20550
20551 // For strict_fadd we need uses of the final extract_vector to be replaced
20552 // with the strict_fadd, but we also need uses of the chain output of the
20553 // original strict_fadd to use the chain output of the new strict_fadd as
20554 // otherwise it may not be deleted.
20555 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20556 {VT, MVT::Other},
20557 {N0->getOperand(0), Extract1, Extract2});
20558 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20559 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20560 return SDValue(N, 0);
20561 }
20562 }
20563
20564 // Given an extract(load) or extract(extend(load)), produce a scalar load
20565 // instead to avoid the cross-register-bank copies.
20566 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
20567 VT.isInteger() && isa<ConstantSDNode>(N1)) {
20568 SDValue LoadN0 = N0;
20569 // Look through sext/zext and extract_subvector / insert_subvector if
20570 // required.
20571 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
20572 N0.getOpcode() == ISD::SIGN_EXTEND ||
20573 N0.getOpcode() == ISD::ANY_EXTEND) &&
20574 N0.getOperand(0).hasOneUse())
20575 LoadN0 = N0.getOperand(0);
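 // Remember the element offset of any extract_subvector so it can be folded
 // into the scalar load's byte offset below.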
20576 unsigned OffsetElts = 0;
20577 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
20578 OffsetElts = LoadN0.getConstantOperandVal(1);
20579 LoadN0 = LoadN0.getOperand(0);
20580 }
20581 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
20582 LoadN0.getOperand(0).isUndef() &&
20583 isNullConstant(LoadN0.getOperand(2)) &&
20584 LoadN0.getOperand(1).hasOneUse())
20585 LoadN0 = LoadN0.getOperand(1);
20586
20587 // Check all the uses are valid and can be scalarized. We check that all the
20588 // uses are extracts and those extracts are not re-inserted into an
20589 // operation best treated as a vector register.
20590 auto Load = dyn_cast<LoadSDNode>(LoadN0);
20591 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
20592 Load->getMemoryVT().isByteSized() &&
20593 all_of(N0->uses(), [&](const SDUse &U) {
20594 return U.getResNo() != N0.getResNo() ||
20595 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20596 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
20597 return U2.getUser()->getOpcode() ==
20598 ISD::INSERT_VECTOR_ELT ||
20599 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
20600 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
20601 }));
20602 })) {
20603
20604 SDLoc DL(Load);
20605
20606 // Generate a new scalar load.
20607 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
20608 Load->getValueType(0).getScalarSizeInBits() / 8;
20609 SDValue BasePtr = DAG.getObjectPtrOffset(
20610 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
20611 ISD::LoadExtType ExtType =
20612 N0.getOpcode() == ISD::ZERO_EXTEND
20613 ? ISD::ZEXTLOAD
20614 : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
20615 : ISD::EXTLOAD);
20616 SDValue ScalarLoad =
20617 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
20618 Load->getPointerInfo().getWithOffset(Offset),
20619 Load->getValueType(0).getScalarType(),
20620 commonAlignment(Load->getAlign(), Offset),
20621 Load->getMemOperand()->getFlags(), Load->getAAInfo());
20622 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
20623 return ScalarLoad;
20624 }
20625 }
20626
20627 return SDValue();
20628}
20629
20630static SDValue performConcatVectorsCombine(SDNode *N,
20631 TargetLowering::DAGCombinerInfo &DCI,
20632 SelectionDAG &DAG) {
20633 SDLoc DL(N);
20634 EVT VT = N->getValueType(0);
20635 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20636 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20637
20638 if (VT.isScalableVector())
20639 return SDValue();
20640
20641 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20642 N1Opc == ISD::TRUNCATE) {
20643 SDValue N00 = N0->getOperand(0);
20644 SDValue N10 = N1->getOperand(0);
20645 EVT N00VT = N00.getValueType();
20646 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20647
20648 // Optimize concat_vectors of truncated vectors, where the intermediate
20649 // type is illegal, to avoid said illegality, e.g.,
20650 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20651 // (v2i16 (truncate (v2i64)))))
20652 // ->
20653 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20654 // (v4i32 (bitcast (v2i64))),
20655 // <0, 2, 4, 6>)))
20656 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20657 // on both input and result type, so we might generate worse code.
20658 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20659 if (N00VT == N10.getValueType() &&
20660 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20661 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20662 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20663 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20664 for (size_t i = 0; i < Mask.size(); ++i)
20665 Mask[i] = i * 2;
20666 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20667 DAG.getVectorShuffle(
20668 MidVT, DL,
20669 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20670 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20671 }
20672
20673 // Optimize two large shifts and a combine into a single combine and shift
20674 // For AArch64 architectures, sequences like the following:
20675 //
20676 // ushr v0.4s, v0.4s, #20
20677 // ushr v1.4s, v1.4s, #20
20678 // uzp1 v0.8h, v0.8h, v1.8h
20679 //
20680 // Can be optimized to:
20681 //
20682 // uzp2 v0.8h, v0.8h, v1.8h
20683 // ushr v0.8h, v0.8h, #4
20684 //
20685 // This optimization reduces instruction count.
20686 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20687 N00->getOperand(1) == N10->getOperand(1)) {
20688 SDValue N000 = N00->getOperand(0);
20689 SDValue N100 = N10->getOperand(0);
20690 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20691 N101ConstVal = N10->getConstantOperandVal(1),
20692 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20693
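 // Only profitable when the shift amount exceeds the narrow element width:
 // UZP2 then selects the high halves and a single narrower shift finishes the job.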
20694 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20695 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20696 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20697 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20698 SDValue NewShiftConstant =
20699 DAG.getConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20700
20701 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20702 }
20703 }
20704 }
20705
20706 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20707 N->getOperand(0).getValueType() == MVT::v2i16 ||
20708 N->getOperand(0).getValueType() == MVT::v2i8) {
20709 EVT SrcVT = N->getOperand(0).getValueType();
20710 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20711 // loads to prevent having to go through the v4i8 load legalization that
20712 // needs to extend each element into a larger type.
20713 if (N->getNumOperands() % 2 == 0 &&
20714 all_of(N->op_values(), [SrcVT](SDValue V) {
20715 if (V.getValueType() != SrcVT)
20716 return false;
20717 if (V.isUndef())
20718 return true;
20719 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20720 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20721 LD->getExtensionType() == ISD::NON_EXTLOAD;
20722 })) {
20723 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20724 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20725 SmallVector<SDValue> Ops;
20726
20727 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20728 SDValue V = N->getOperand(i);
20729 if (V.isUndef())
20730 Ops.push_back(DAG.getUNDEF(FVT));
20731 else {
20732 LoadSDNode *LD = cast<LoadSDNode>(V);
20733 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20734 LD->getBasePtr(), LD->getMemOperand());
20735 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20736 Ops.push_back(NewLoad);
20737 }
20738 }
20739 return DAG.getBitcast(N->getValueType(0),
20740 DAG.getBuildVector(NVT, DL, Ops));
20741 }
20742 }
20743
20744 // Canonicalise concat_vectors to replace concatenations of truncated nots
20745 // with nots of concatenated truncates. This in some cases allows for multiple
20746 // redundant negations to be eliminated.
20747 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20748 // (v4i16 (truncate (not (v4i32)))))
20749 // ->
20750 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20751 // (v4i16 (truncate (v4i32)))))
20752 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20753 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20754 N->isOnlyUserOf(N1.getNode())) {
20755 auto isBitwiseVectorNegate = [](SDValue V) {
20756 return V->getOpcode() == ISD::XOR &&
20757 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20758 };
20759 SDValue N00 = N0->getOperand(0);
20760 SDValue N10 = N1->getOperand(0);
20761 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20762 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20763 return DAG.getNOT(
20764 DL,
20765 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20766 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(),
20767 N00->getOperand(0)),
20768 DAG.getNode(ISD::TRUNCATE, DL, N1.getValueType(),
20769 N10->getOperand(0))),
20770 VT);
20771 }
20772 }
20773
20774 // Wait till after everything is legalized to try this. That way we have
20775 // legal vector types and such.
20776 if (DCI.isBeforeLegalizeOps())
20777 return SDValue();
20778
20779 // Optimise concat_vectors of two identical binops with a 128-bit destination
20780 // size, combine into a binop of two concats of the source vectors, e.g.:
20781 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20782 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20783 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20784 isVectorizedBinOp(N0Opc)) &&
20785 N0->hasOneUse() && N1->hasOneUse()) {
20786 SDValue N00 = N0->getOperand(0);
20787 SDValue N01 = N0->getOperand(1);
20788 SDValue N10 = N1->getOperand(0);
20789 SDValue N11 = N1->getOperand(1);
20790
20791 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20792 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20793 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20794 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20795 }
20796 }
20797
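 // Matches the expanded form of a rounding shift-right-narrow: a VLSHR whose
 // input adds the rounding constant 1 << (ShiftAmt - 1).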
20798 auto IsRSHRN = [](SDValue Shr) {
20799 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20800 return false;
20801 SDValue Op = Shr.getOperand(0);
20802 EVT VT = Op.getValueType();
20803 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20804 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20805 return false;
20806
20807 APInt Imm;
20808 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20809 Imm = APInt(VT.getScalarSizeInBits(),
20810 Op.getOperand(1).getConstantOperandVal(0)
20811 << Op.getOperand(1).getConstantOperandVal(1));
20812 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20813 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20814 Imm = APInt(VT.getScalarSizeInBits(),
20815 Op.getOperand(1).getConstantOperandVal(0));
20816 else
20817 return false;
20818
20819 if (Imm != 1ULL << (ShtAmt - 1))
20820 return false;
20821 return true;
20822 };
20823
20824 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20825 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20826 ((IsRSHRN(N1) &&
20827 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20828 N1.isUndef())) {
20829 SDValue X = N0.getOperand(0).getOperand(0);
20830 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20831 : N1.getOperand(0).getOperand(0);
20832 EVT BVT =
20833 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20834 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20835 SDValue Add = DAG.getNode(
20836 ISD::ADD, DL, BVT, CC,
20837 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20838 SDValue Shr =
20839 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20840 return Shr;
20841 }
20842
20843 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20844 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20845 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20846 N0.getOperand(1) == N1.getOperand(1)) {
20847 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20848 DAG.getUNDEF(N0.getValueType()));
20849 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20850 DAG.getUNDEF(N0.getValueType()));
20851 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20852 }
20853
20854 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20855 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20856 // canonicalise to that.
20857 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20858 assert(VT.getScalarSizeInBits() == 64);
20859 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20860 DAG.getConstant(0, DL, MVT::i64));
20861 }
20862
20863 // Canonicalise concat_vectors so that the right-hand vector has as few
20864 // bit-casts as possible before its real operation. The primary matching
20865 // destination for these operations will be the narrowing "2" instructions,
20866 // which depend on the operation being performed on this right-hand vector.
20867 // For example,
20868 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20869 // becomes
20870 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20871
20872 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20873 return SDValue();
20874 SDValue RHS = N1->getOperand(0);
20875 MVT RHSTy = RHS.getValueType().getSimpleVT();
20876 // If the RHS is not a vector, this is not the pattern we're looking for.
20877 if (!RHSTy.isVector())
20878 return SDValue();
20879
20880 LLVM_DEBUG(
20881 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20882
20883 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20884 RHSTy.getVectorNumElements() * 2);
20885 return DAG.getNode(ISD::BITCAST, DL, VT,
20886 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20887 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20888 RHS));
20889}
20890
20891static SDValue
20892performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20893 SelectionDAG &DAG) {
20894 if (DCI.isBeforeLegalizeOps())
20895 return SDValue();
20896
20897 EVT VT = N->getValueType(0);
20898 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20899 return SDValue();
20900
20901 SDValue V = N->getOperand(0);
20902
20903 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20904 // blocks this combine because the non-const case requires custom lowering.
20905 //
20906 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20907 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20908 if (isa<ConstantSDNode>(V.getOperand(0)))
20909 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20910
20911 return SDValue();
20912}
20913
20914static SDValue
20915performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20916 SelectionDAG &DAG) {
20917 SDLoc DL(N);
20918 SDValue Vec = N->getOperand(0);
20919 SDValue SubVec = N->getOperand(1);
20920 uint64_t IdxVal = N->getConstantOperandVal(2);
20921 EVT VecVT = Vec.getValueType();
20922 EVT SubVT = SubVec.getValueType();
20923
20924 // Promote fixed length vector zeros.
20925 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20926 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20927 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20928 : DAG.getConstantFP(0, DL, VecVT);
20929
20930 // Only do this for legal fixed vector types.
20931 if (!VecVT.isFixedLengthVector() ||
20932 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20933 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20934 return SDValue();
20935
20936 // Ignore widening patterns.
20937 if (IdxVal == 0 && Vec.isUndef())
20938 return SDValue();
20939
20940 // Subvector must be half the width and an "aligned" insertion.
20941 unsigned NumSubElts = SubVT.getVectorNumElements();
20942 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20943 (IdxVal != 0 && IdxVal != NumSubElts))
20944 return SDValue();
20945
20946 // Fold insert_subvector -> concat_vectors
20947 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20948 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20949 SDValue Lo, Hi;
20950 if (IdxVal == 0) {
20951 Lo = SubVec;
20952 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20953 DAG.getVectorIdxConstant(NumSubElts, DL));
20954 } else {
20955 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20956 DAG.getVectorIdxConstant(0, DL));
20957 Hi = SubVec;
20958 }
20959 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20960}
20961
20962static SDValue tryCombineFixedPointConvert(SDNode *N,
20963 TargetLowering::DAGCombinerInfo &DCI,
20964 SelectionDAG &DAG) {
20965 // Wait until after everything is legalized to try this. That way we have
20966 // legal vector types and such.
20967 if (DCI.isBeforeLegalizeOps())
20968 return SDValue();
20969 // Transform a scalar conversion of a value from a lane extract into a
20970 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20971 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20972 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20973 //
20974 // The second form interacts better with instruction selection and the
20975 // register allocator to avoid cross-class register copies that aren't
20976 // coalescable due to a lane reference.
20977
20978 // Check the operand and see if it originates from a lane extract.
20979 SDValue Op1 = N->getOperand(1);
20980 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20981 return SDValue();
20982
20983 // Yep, no additional predication needed. Perform the transform.
20984 SDValue IID = N->getOperand(0);
20985 SDValue Shift = N->getOperand(2);
20986 SDValue Vec = Op1.getOperand(0);
20987 SDValue Lane = Op1.getOperand(1);
20988 EVT ResTy = N->getValueType(0);
20989 EVT VecResTy;
20990 SDLoc DL(N);
20991
20992 // The vector width should be 128 bits by the time we get here, even
20993 // if it started as 64 bits (the extract_vector handling will have
20994 // done so). Bail if it is not.
20995 if (Vec.getValueSizeInBits() != 128)
20996 return SDValue();
20997
20998 if (Vec.getValueType() == MVT::v4i32)
20999 VecResTy = MVT::v4f32;
21000 else if (Vec.getValueType() == MVT::v2i64)
21001 VecResTy = MVT::v2f64;
21002 else
21003 return SDValue();
21004
21005 SDValue Convert =
21006 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
21007 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
21008}
21009
21010// AArch64 high-vector "long" operations are formed by performing the non-high
21011// version on an extract_subvector of each operand which gets the high half:
21012//
21013// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
21014//
21015// However, there are cases which don't have an extract_high explicitly, but
21016// have another operation that can be made compatible with one for free. For
21017// example:
21018//
21019// (dupv64 scalar) --> (extract_high (dup128 scalar))
21020//
21021// This routine does the actual conversion of such DUPs, once outer routines
21022// have determined that everything else is in order.
21023// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
21024// similarly here.
21025static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
21026 MVT VT = N.getSimpleValueType();
21027 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21028 N.getConstantOperandVal(1) == 0)
21029 N = N.getOperand(0);
21030
21031 switch (N.getOpcode()) {
21032 case AArch64ISD::DUP:
21033 case AArch64ISD::DUPLANE8:
21034 case AArch64ISD::DUPLANE16:
21035 case AArch64ISD::DUPLANE32:
21036 case AArch64ISD::DUPLANE64:
21037 case AArch64ISD::MOVI:
21038 case AArch64ISD::MOVIshift:
21039 case AArch64ISD::MOVIedit:
21040 case AArch64ISD::MOVImsl:
21041 case AArch64ISD::MVNIshift:
21042 case AArch64ISD::MVNImsl:
21043 break;
21044 default:
21045 // FMOV could be supported, but isn't very useful, as it would only occur
21046 // if you passed a bitcast'd floating point immediate to an eligible long
21047 // integer op (addl, smull, ...).
21048 return SDValue();
21049 }
21050
21051 if (!VT.is64BitVector())
21052 return SDValue();
21053
21054 SDLoc DL(N);
21055 unsigned NumElems = VT.getVectorNumElements();
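 // Recreate the DUP-like node at twice the width so that its high half can be
 // extracted below.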
21056 if (N.getValueType().is64BitVector()) {
21057 MVT ElementTy = VT.getVectorElementType();
21058 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
21059 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
21060 }
21061
21062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
21063 DAG.getConstant(NumElems, DL, MVT::i64));
21064}
21065
21066static bool isEssentiallyExtractHighSubvector(SDValue N) {
21067 if (N.getOpcode() == ISD::BITCAST)
21068 N = N.getOperand(0);
21069 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21070 return false;
21071 if (N.getOperand(0).getValueType().isScalableVector())
21072 return false;
21073 return N.getConstantOperandAPInt(1) ==
21074 N.getOperand(0).getValueType().getVectorNumElements() / 2;
21075}
21076
21077/// Helper structure to keep track of ISD::SET_CC operands.
21078struct GenericSetCCInfo {
21079 const SDValue *Opnd0;
21080 const SDValue *Opnd1;
21081 ISD::CondCode CC;
21082};
21083
21084/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
21085struct AArch64SetCCInfo {
21086 const SDValue *Cmp;
21087 AArch64CC::CondCode CC;
21088};
21089
21090/// Helper structure to keep track of SetCC information.
21091union SetCCInfo {
21092 GenericSetCCInfo Generic;
21093 AArch64SetCCInfo AArch64;
21094};
21095
21096/// Helper structure to be able to read SetCC information. If IsAArch64 is
21097/// set to true, Info is an AArch64SetCCInfo; otherwise Info is a
21098/// GenericSetCCInfo.
21099struct SetCCInfoAndKind {
21100 SetCCInfo Info;
21101 bool IsAArch64;
21102};
21103
21104/// Check whether or not \p Op is a SET_CC operation, either a generic or
21105/// an
21106/// AArch64 lowered one.
21107/// \p SetCCInfo is filled accordingly.
21108/// \post SetCCInfo is meaningful only when this function returns true.
21109/// \return True when Op is a kind of SET_CC operation.
21110static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
21111 // If this is a setcc, this is straightforward.
21112 if (Op.getOpcode() == ISD::SETCC) {
21113 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
21114 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
21115 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21116 SetCCInfo.IsAArch64 = false;
21117 return true;
21118 }
21119 // Otherwise, check if this is a matching csel instruction.
21120 // In other words:
21121 // - csel 1, 0, cc
21122 // - csel 0, 1, !cc
21123 if (Op.getOpcode() != AArch64ISD::CSEL)
21124 return false;
21125 // Set the information about the operands.
21126 // TODO: we want the operands of the Cmp not the csel
21127 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
21128 SetCCInfo.IsAArch64 = true;
21129 SetCCInfo.Info.AArch64.CC =
21130 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21131
21132 // Check that the operands matches the constraints:
21133 // (1) Both operands must be constants.
21134 // (2) One must be 1 and the other must be 0.
21135 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
21136 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
21137
21138 // Check (1).
21139 if (!TValue || !FValue)
21140 return false;
21141
21142 // Check (2).
21143 if (!TValue->isOne()) {
21144 // Update the comparison when we are interested in !cc.
21145 std::swap(TValue, FValue);
21146 SetCCInfo.Info.AArch64.CC =
21147 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
21148 }
21149 return TValue->isOne() && FValue->isZero();
21150}
21151
21152// Returns true if Op is setcc or zext of setcc.
21153static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
21154 if (isSetCC(Op, Info))
21155 return true;
21156 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21157 isSetCC(Op->getOperand(0), Info));
21158}
21159
21160// The folding we want to perform is:
21161// (add x, [zext] (setcc cc ...) )
21162// -->
21163// (csel x, (add x, 1), !cc ...)
21164//
21165// The latter will get matched to a CSINC instruction.
21166static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
21167 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21168 SDValue LHS = Op->getOperand(0);
21169 SDValue RHS = Op->getOperand(1);
21170 SetCCInfoAndKind InfoAndKind;
21171
21172 // If both operands are a SET_CC, then we don't want to perform this
21173 // folding and create another csel as this results in more instructions
21174 // (and higher register usage).
21175 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21176 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21177 return SDValue();
21178
21179 // If neither operand is a SET_CC, give up.
21180 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21181 std::swap(LHS, RHS);
21182 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21183 return SDValue();
21184 }
21185
21186 // FIXME: This could be generalized to work for FP comparisons.
21187 EVT CmpVT = InfoAndKind.IsAArch64
21188 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21189 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21190 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21191 return SDValue();
21192
21193 SDValue CCVal;
21194 SDValue Cmp;
21195 SDLoc DL(Op);
21196 if (InfoAndKind.IsAArch64) {
21197 CCVal = DAG.getConstant(
21198 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), DL,
21199 MVT::i32);
21200 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21201 } else
21202 Cmp = getAArch64Cmp(
21203 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21204 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21205 DL);
21206
21207 EVT VT = Op->getValueType(0);
21208 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21209 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21210}
21211
21212// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
21213static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
21214 EVT VT = N->getValueType(0);
21215 // Only scalar integer and vector types.
21216 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21217 return SDValue();
21218
21219 SDValue LHS = N->getOperand(0);
21220 SDValue RHS = N->getOperand(1);
21221 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21222 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21223 return SDValue();
21224
21225 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21226 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21227 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21228 return SDValue();
21229
21230 SDValue Op1 = LHS->getOperand(0);
21231 SDValue Op2 = RHS->getOperand(0);
21232 EVT OpVT1 = Op1.getValueType();
21233 EVT OpVT2 = Op2.getValueType();
21234 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21235 Op2.getOpcode() != AArch64ISD::UADDV ||
21236 OpVT1.getVectorElementType() != VT)
21237 return SDValue();
21238
21239 SDValue Val1 = Op1.getOperand(0);
21240 SDValue Val2 = Op2.getOperand(0);
21241 EVT ValVT = Val1->getValueType(0);
21242 SDLoc DL(N);
21243 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21244 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21245 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21246 DAG.getConstant(0, DL, MVT::i64));
21247}
21248
21249/// Perform the scalar expression combine in the form of:
21250/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21251/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
21252static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
21253 EVT VT = N->getValueType(0);
21254 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21255 return SDValue();
21256
21257 SDValue LHS = N->getOperand(0);
21258 SDValue RHS = N->getOperand(1);
21259
21260 // Handle commutativity.
21261 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21262 LHS.getOpcode() != AArch64ISD::CSNEG) {
21263 std::swap(LHS, RHS);
21264 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21265 LHS.getOpcode() != AArch64ISD::CSNEG) {
21266 return SDValue();
21267 }
21268 }
21269
21270 if (!LHS.hasOneUse())
21271 return SDValue();
21272
21273 AArch64CC::CondCode AArch64CC =
21274 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21275
21276 // The CSEL should include a const one operand, and the CSNEG should include
21277 // One or NegOne operand.
21278 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21279 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21280 if (!CTVal || !CFVal)
21281 return SDValue();
21282
21283 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21284 (CTVal->isOne() || CFVal->isOne())) &&
21285 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21286 (CTVal->isOne() || CFVal->isAllOnes())))
21287 return SDValue();
21288
21289 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21290 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21291 !CFVal->isOne()) {
21292 std::swap(CTVal, CFVal);
21293 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21294 }
21295
21296 SDLoc DL(N);
21297 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21298 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21299 !CFVal->isAllOnes()) {
21300 APInt C = -1 * CFVal->getAPIntValue();
21301 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21302 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21303 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21304 }
21305
21306 // It might be neutral for larger constants, as the immediate needs to be
21307 // materialized in a register.
21308 APInt ADDC = CTVal->getAPIntValue();
21309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21310 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21311 return SDValue();
21312
21313 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21314 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21315 "Unexpected constant value");
21316
21317 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21318 SDValue CCVal = getCondCode(DAG, AArch64CC);
21319 SDValue Cmp = LHS.getOperand(3);
21320
21321 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21322}
21323
21324// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21325static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
21326 EVT VT = N->getValueType(0);
21327 if (N->getOpcode() != ISD::ADD)
21328 return SDValue();
21329
21330 SDValue Dot = N->getOperand(0);
21331 SDValue A = N->getOperand(1);
21332 // Handle commutativity
21333 auto isZeroDot = [](SDValue Dot) {
21334 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21335 Dot.getOpcode() == AArch64ISD::SDOT) &&
21336 isZerosVector(Dot.getOperand(0).getNode());
21337 };
21338 if (!isZeroDot(Dot))
21339 std::swap(Dot, A);
21340 if (!isZeroDot(Dot))
21341 return SDValue();
21342
21343 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21344 Dot.getOperand(2));
21345}
21346
21347static bool isNegatedInteger(SDValue Op) {
21348 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21349}
21350
21351// Try to fold
21352//
21353// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21354//
21355// The folding helps csel to be matched with csneg without generating
21356// redundant neg instruction, which includes negation of the csel expansion
21357// of abs node lowered by lowerABS.
21358static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
21359 if (!isNegatedInteger(SDValue(N, 0)))
21360 return SDValue();
21361
21362 SDValue CSel = N->getOperand(1);
21363 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21364 return SDValue();
21365
21366 SDValue N0 = CSel.getOperand(0);
21367 SDValue N1 = CSel.getOperand(1);
21368
21369 // If neither of them are negations, it's not worth the folding as it
21370 // introduces two additional negations while reducing one negation.
21371 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21372 return SDValue();
21373
21374 SDLoc DL(N);
21375 EVT VT = CSel.getValueType();
21376
21377 SDValue N0N = DAG.getNegative(N0, DL, VT);
21378 SDValue N1N = DAG.getNegative(N1, DL, VT);
21379
21380 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21381 CSel.getOperand(3));
21382}
21383
21384// The basic add/sub long vector instructions have variants with "2" on the end
21385// which act on the high-half of their inputs. They are normally matched by
21386// patterns like:
21387//
21388// (add (zeroext (extract_high LHS)),
21389// (zeroext (extract_high RHS)))
21390// -> uaddl2 vD, vN, vM
21391//
21392// However, if one of the extracts is something like a duplicate, this
21393// instruction can still be used profitably. This function puts the DAG into a
21394// more appropriate form for those patterns to trigger.
21395static SDValue performAddSubLongCombine(SDNode *N,
21396 TargetLowering::DAGCombinerInfo &DCI) {
21397 SelectionDAG &DAG = DCI.DAG;
21398 if (DCI.isBeforeLegalizeOps())
21399 return SDValue();
21400
21401 MVT VT = N->getSimpleValueType(0);
21402 if (!VT.is128BitVector()) {
21403 if (N->getOpcode() == ISD::ADD)
21404 return performSetccAddFolding(N, DAG);
21405 return SDValue();
21406 }
21407
21408 // Make sure both branches are extended in the same way.
21409 SDValue LHS = N->getOperand(0);
21410 SDValue RHS = N->getOperand(1);
21411 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21412 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21413 LHS.getOpcode() != RHS.getOpcode())
21414 return SDValue();
21415
21416 unsigned ExtType = LHS.getOpcode();
21417
21418 // It's not worth doing if at least one of the inputs isn't already an
21419 // extract, but we don't know which it'll be so we have to try both.
21420 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21421 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21422 if (!RHS.getNode())
21423 return SDValue();
21424
21425 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21426 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21427 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21428 if (!LHS.getNode())
21429 return SDValue();
21430
21431 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21432 }
21433
21434 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21435}
21436
21437static bool isCMP(SDValue Op) {
21438 return Op.getOpcode() == AArch64ISD::SUBS &&
21439 !Op.getNode()->hasAnyUseOfValue(0);
21440}
21441
21442// (CSEL 1 0 CC Cond) => CC
21443// (CSEL 0 1 CC Cond) => !CC
21444static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21445 if (Op.getOpcode() != AArch64ISD::CSEL)
21446 return std::nullopt;
21447 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21448 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21449 return std::nullopt;
21450 SDValue OpLHS = Op.getOperand(0);
21451 SDValue OpRHS = Op.getOperand(1);
21452 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21453 return CC;
21454 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21455 return getInvertedCondCode(CC);
21456
21457 return std::nullopt;
21458}
21459
21460// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21461// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21462static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21463 SDValue CmpOp = Op->getOperand(2);
21464 if (!isCMP(CmpOp))
21465 return SDValue();
21466
21467 if (IsAdd) {
21468 if (!isOneConstant(CmpOp.getOperand(1)))
21469 return SDValue();
21470 } else {
21471 if (!isNullConstant(CmpOp.getOperand(0)))
21472 return SDValue();
21473 }
21474
21475 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21476 auto CC = getCSETCondCode(CsetOp);
21477 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21478 return SDValue();
21479
21480 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21481 Op->getOperand(0), Op->getOperand(1),
21482 CsetOp.getOperand(3));
21483}
21484
21485// (ADC x 0 cond) => (CINC x HS cond)
21486static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
21487 SDValue LHS = N->getOperand(0);
21488 SDValue RHS = N->getOperand(1);
21489 SDValue Cond = N->getOperand(2);
21490
21491 if (!isNullConstant(RHS))
21492 return SDValue();
21493
21494 EVT VT = N->getValueType(0);
21495 SDLoc DL(N);
21496
21497 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21498 SDValue CC = getCondCode(DAG, AArch64CC::LO);
21499 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21500}
21501
21502static SDValue performBuildVectorCombine(SDNode *N,
21503 TargetLowering::DAGCombinerInfo &DCI,
21504 SelectionDAG &DAG) {
21505 SDLoc DL(N);
21506 EVT VT = N->getValueType(0);
21507
21508 if (DCI.isAfterLegalizeDAG() &&
21509 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21510 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21511 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21512 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21513 Elt1->getOpcode() == ISD::FP_ROUND &&
21514 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21515 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21516 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21517 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21518 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21519 // Constant index.
21520 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21521 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21522 Elt0->getOperand(0)->getOperand(0) ==
21523 Elt1->getOperand(0)->getOperand(0) &&
21524 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21525 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21526 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21527 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21528 SDValue HighLanes;
21529 if (Elt2->getOpcode() == ISD::UNDEF &&
21530 Elt3->getOpcode() == ISD::UNDEF) {
21531 HighLanes = DAG.getUNDEF(MVT::v2f32);
21532 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21533 Elt3->getOpcode() == ISD::FP_ROUND &&
21534 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21535 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21536 Elt2->getConstantOperandVal(1) ==
21537 Elt3->getConstantOperandVal(1) &&
21538 Elt2->getOperand(0)->getOpcode() ==
21539 ISD::EXTRACT_VECTOR_ELT &&
21540 Elt3->getOperand(0)->getOpcode() ==
21541 ISD::EXTRACT_VECTOR_ELT &&
21542 // Constant index.
21543 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21544 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21545 Elt2->getOperand(0)->getOperand(0) ==
21546 Elt3->getOperand(0)->getOperand(0) &&
21547 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21548 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21549 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21550 HighLanes =
21551 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21552 }
21553 if (HighLanes) {
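 // FCVTXN uses round-to-odd, so the final FP_ROUND to f16 below does not
 // introduce double rounding.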
21554 SDValue DoubleToSingleSticky =
21555 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21556 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21557 DoubleToSingleSticky, HighLanes);
21558 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21559 Elt0->getOperand(1));
21560 }
21561 }
21562 }
21563 }
21564
21565 if (VT == MVT::v2f64) {
21566 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21567 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21568 Elt1->getOpcode() == ISD::FP_EXTEND &&
21569 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21570 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21571 Elt0->getOperand(0)->getOperand(0) ==
21572 Elt1->getOperand(0)->getOperand(0) &&
21573 // Constant index.
21574 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21575 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21576 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21577 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21578 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21579 // ResultType's known minimum vector length.
21580 Elt0->getOperand(0)->getConstantOperandVal(1) %
21581 VT.getVectorMinNumElements() ==
21582 0) {
21583 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21584 if (SrcVec.getValueType() == MVT::v4f16 ||
21585 SrcVec.getValueType() == MVT::v4bf16) {
21586 SDValue HalfToSingle =
21587 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21588 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21589 SDValue Extract = DAG.getNode(
21590 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
21591 HalfToSingle, SubvectorIdx);
21592 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21593 }
21594 }
21595 }
21596
21597 // A build vector of two extracted elements is equivalent to an
21598 // extract subvector where the inner vector is any-extended to the
21599 // extract_vector_elt VT.
21600 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21601 // (extract_elt_iXX_to_i32 vec Idx+1))
21602 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21603
21604 // For now, only consider the v2i32 case, which arises as a result of
21605 // legalization.
21606 if (VT != MVT::v2i32)
21607 return SDValue();
21608
21609 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21610 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21611 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21612 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21613 // Constant index.
21614 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21615 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21616 // Both EXTRACT_VECTOR_ELT from same vector...
21617 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21618 // ... and contiguous. First element's index +1 == second element's index.
21619 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21620 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21621 // ResultType's known minimum vector length.
21622 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21623 SDValue VecToExtend = Elt0->getOperand(0);
21624 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21625 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21626 return SDValue();
21627
21628 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21629
21630 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21631 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21632 SubvectorIdx);
21633 }
21634
21635 return SDValue();
21636}
21637
21638// A special combine for the sqdmulh family of instructions.
21639// smin(sra(mul(sext v0, sext v1), SHIFT_AMOUNT), SATURATING_VAL)
21640// can be reduced to sqdmulh(...)
21641static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
21642
21643 if (N->getOpcode() != ISD::SMIN)
21644 return SDValue();
21645
21646 EVT DestVT = N->getValueType(0);
21647
21648 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21649 DestVT.isScalableVector())
21650 return SDValue();
21651
21652 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21653
21654 if (!Clamp)
21655 return SDValue();
21656
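 // The clamp constant identifies the element type being saturated:
 // 0x7FFF for i16 and 0x7FFFFFFF for i32.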
21657 MVT ScalarType;
21658 unsigned ShiftAmt = 0;
21659 switch (Clamp->getSExtValue()) {
21660 case (1ULL << 15) - 1:
21661 ScalarType = MVT::i16;
21662 ShiftAmt = 16;
21663 break;
21664 case (1ULL << 31) - 1:
21665 ScalarType = MVT::i32;
21666 ShiftAmt = 32;
21667 break;
21668 default:
21669 return SDValue();
21670 }
21671
21672 SDValue Sra = N->getOperand(0);
21673 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21674 return SDValue();
21675
21676 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21677 if (!RightShiftVec)
21678 return SDValue();
21679 unsigned SExtValue = RightShiftVec->getSExtValue();
21680
21681 if (SExtValue != (ShiftAmt - 1))
21682 return SDValue();
21683
21684 SDValue Mul = Sra.getOperand(0);
21685 if (Mul.getOpcode() != ISD::MUL)
21686 return SDValue();
21687
21688 SDValue SExt0 = Mul.getOperand(0);
21689 SDValue SExt1 = Mul.getOperand(1);
21690
21691 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21692 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21693 return SDValue();
21694
21695 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21696 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21697
21698 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21699 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21700 SExt0Type.getVectorNumElements() == 1)
21701 return SDValue();
21702
21703 SDLoc DL(N);
21704 SDValue V0 = SExt0.getOperand(0);
21705 SDValue V1 = SExt1.getOperand(0);
21706
21707 // Ensure input vectors are extended to legal types
21708 if (SExt0Type.getFixedSizeInBits() < 64) {
21709 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21710 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21711 VecNumElements);
21712 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21713 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21714 }
21715
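 // SQDMULH computes the high half of the saturating doubling multiply, i.e.
 // (v0 * v1) >> (EltBits - 1) with saturation, which matches the clamped
 // sra(mul(sext, sext)) pattern matched above.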
21716 SDValue SQDMULH =
21717 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21718
21719 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21720}
21721
21722static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
21723 if (SDValue V = trySQDMULHCombine(N, DAG)) {
21724 return V;
21725 }
21726
21727 return SDValue();
21728}
21729
21730static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
21731 TargetLowering::DAGCombinerInfo &DCI) {
21732 SDLoc DL(N);
21733 EVT VT = N->getValueType(0);
21734 SDValue N0 = N->getOperand(0);
21735 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21736 N0.getOpcode() == AArch64ISD::DUP) {
21737 SDValue Op = N0.getOperand(0);
21738 if (VT.getScalarType() == MVT::i32 &&
21739 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21740 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21741 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21742 }
21743
21744 // Performing the following combine produces a preferable form for ISEL.
21745 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
21746 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21747 N0.hasOneUse()) {
21748 SDValue Op = N0.getOperand(0);
21749 SDValue ExtractIndexNode = N0.getOperand(1);
21750 if (!isa<ConstantSDNode>(ExtractIndexNode))
21751 return SDValue();
21752
21753 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21754 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21755 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21756 "Unexpected legalisation result!");
21757
21758 EVT SrcVectorType = Op.getValueType();
21759 // We also assume that SrcVectorType cannot be a V64 (see
21760 // LowerEXTRACT_VECTOR_ELT).
21761 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21762 "Unexpected legalisation result!");
21763
21764 unsigned ExtractIndex =
21765 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21766 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21767
21768 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21770 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21771 }
21772
21773 return SDValue();
21774}
21775
21776// Check whether a node is an extend or shift operand.
21777static bool isExtendOrShiftOperand(SDValue N) {
21778 unsigned Opcode = N.getOpcode();
21779 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21780 EVT SrcVT;
21781 if (Opcode == ISD::SIGN_EXTEND_INREG)
21782 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21783 else
21784 SrcVT = N.getOperand(0).getValueType();
21785
21786 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21787 } else if (Opcode == ISD::AND) {
21788 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21789 if (!CSD)
21790 return false;
21791 uint64_t AndMask = CSD->getZExtValue();
21792 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21793 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21794 return isa<ConstantSDNode>(N.getOperand(1));
21795 }
21796
21797 return false;
21798}
21799
21800// (N - Y) + Z --> (Z - Y) + N
21801// when N is an extend or shift operand
21802static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
21803 SelectionDAG &DAG) {
21804 auto IsOneUseExtend = [](SDValue N) {
21805 return N.hasOneUse() && isExtendOrShiftOperand(N);
21806 };
21807
21808 // DAGCombiner will revert the combination when Z is constant, which would
21809 // result in an infinite loop. So don't enable the combination when Z is
21810 // constant. If Z is a one-use shift by a constant, we also can't do the
21811 // optimization, as it would likewise fall into an infinite loop.
21812 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21813 return SDValue();
21814
21815 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21816 return SDValue();
21817
21818 SDValue Shift = SUB.getOperand(0);
21819 if (!IsOneUseExtend(Shift))
21820 return SDValue();
21821
21822 SDLoc DL(N);
21823 EVT VT = N->getValueType(0);
21824
21825 SDValue Y = SUB.getOperand(1);
21826 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21827 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21828}
21829
21831 SelectionDAG &DAG) {
21832 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21833 // commutative.
21834 if (N->getOpcode() != ISD::ADD)
21835 return SDValue();
21836
21837 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21838 // shifted register is only available for i32 and i64.
21839 EVT VT = N->getValueType(0);
21840 if (VT != MVT::i32 && VT != MVT::i64)
21841 return SDValue();
21842
21843 SDLoc DL(N);
21844 SDValue LHS = N->getOperand(0);
21845 SDValue RHS = N->getOperand(1);
21846
21847 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21848 return Val;
21849 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21850 return Val;
21851
21852 uint64_t LHSImm = 0, RHSImm = 0;
21853 // If both operands are shifted by an immediate and the shift amount is not
21854 // greater than 4 for one of them, swap LHS and RHS to put the operand with
21855 // the smaller shift amount on the RHS.
21856 //
21857 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21858 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21859 // with LSL (shift > 4). For other processors, this swap is a no-op for both
21860 // performance and correctness.
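// Illustrative example: (add (shl x, 2), (shl y, 7)) becomes
// (add (shl y, 7), (shl x, 2)), so that the cheap shift (lsl #2) ends up on
// the RHS where it can be folded into the shifted-register form of ADD.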
21861 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21862 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21863 RHSImm > 4 && LHS.hasOneUse())
21864 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21865
21866 return SDValue();
21867}
21868
21869 // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21870 // This reassociates it back to allow the creation of more mls instructions.
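// For example (illustrative):
//   sub(x, add(mul(a, b), mul(c, d))) --> sub(sub(x, mul(a, b)), mul(c, d))
// which can then be selected as two mls (multiply-subtract) instructions.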
21872 if (N->getOpcode() != ISD::SUB)
21873 return SDValue();
21874
21875 SDValue Add = N->getOperand(1);
21876 SDValue X = N->getOperand(0);
21877 if (Add.getOpcode() != ISD::ADD)
21878 return SDValue();
21879
21880 if (!Add.hasOneUse())
21881 return SDValue();
21883 return SDValue();
21884
21885 SDValue M1 = Add.getOperand(0);
21886 SDValue M2 = Add.getOperand(1);
21887 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21888 M1.getOpcode() != AArch64ISD::UMULL)
21889 return SDValue();
21890 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21891 M2.getOpcode() != AArch64ISD::UMULL)
21892 return SDValue();
21893
21894 EVT VT = N->getValueType(0);
21895 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21896 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21897}
21898
21899// Combine into mla/mls.
21900// This works on the patterns of:
21901// add v1, (mul v2, v3)
21902// sub v1, (mul v2, v3)
21903// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21904// It will transform the add/sub to a scalable version, so that we can
21905 // make use of the SVE MLA/MLS instructions that will be generated for that pattern.
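// A rough sketch of the transform (assuming SVE and v2i64 operands), where
// convertTo/FromScalableVector are roughly insert/extract_subvector at index 0:
//   add X:v2i64, (extract_subvector M:nxv2i64, 0)   ; M = MUL_PRED Pg, A, B
//   --> extract_subvector (add (insert_subvector undef, X, 0), M):nxv2i64, 0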
21906static SDValue
21908 SelectionDAG &DAG = DCI.DAG;
21909 // Make sure that the types are legal
21910 if (!DCI.isAfterLegalizeDAG())
21911 return SDValue();
21912 // Before using SVE's features, check first if it's available.
21913 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21914 return SDValue();
21915
21916 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21917 return SDValue();
21918
21919 if (!N->getValueType(0).isFixedLengthVector())
21920 return SDValue();
21921
21922 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21923 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21924 return SDValue();
21925
21926 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21927 return SDValue();
21928
21929 SDValue MulValue = Op1->getOperand(0);
21930 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21931 return SDValue();
21932
21933 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21934 return SDValue();
21935
21936 EVT ScalableVT = MulValue.getValueType();
21937 if (!ScalableVT.isScalableVector())
21938 return SDValue();
21939
21940 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21941 SDValue NewValue =
21942 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21943 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21944 };
21945
21946 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21947 return res;
21948 else if (N->getOpcode() == ISD::ADD)
21949 return performOpt(N->getOperand(1), N->getOperand(0));
21950
21951 return SDValue();
21952}
21953
21954 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21955// help, for example, to produce ssra from sshr+add.
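// Illustrative example:
//   (i64 add (extract_elt (v1i64 sshr V, #3), 0), (i64 load p))
//   --> (extract_elt (v1i64 add (sshr V, #3), (scalar_to_vector (load p))), 0)
// which can then be selected as ssra.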
21957 EVT VT = N->getValueType(0);
21958 if (VT != MVT::i64 ||
21959 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21960 return SDValue();
21961 SDValue Op0 = N->getOperand(0);
21962 SDValue Op1 = N->getOperand(1);
21963
21964 // At least one of the operands should be an extract, and the other should be
21965 // something that is easy to convert to v1i64 type (in this case a load).
21966 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21967 Op0.getOpcode() != ISD::LOAD)
21968 return SDValue();
21969 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21970 Op1.getOpcode() != ISD::LOAD)
21971 return SDValue();
21972
21973 SDLoc DL(N);
21974 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21975 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21976 Op0 = Op0.getOperand(0);
21977 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21978 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21979 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21980 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21981 Op1 = Op1.getOperand(0);
21982 } else
21983 return SDValue();
21984
21985 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21986 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21987 DAG.getConstant(0, DL, MVT::i64));
21988}
21989
21992 if (!BV->hasOneUse())
21993 return false;
21994 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21995 if (!Ld || !Ld->isSimple())
21996 return false;
21997 Loads.push_back(Ld);
21998 return true;
21999 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
22001 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
22002 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
22003 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
22004 return false;
22005 Loads.push_back(Ld);
22006 }
22007 return true;
22008 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
22009 // Try to find a tree of shuffles and concats from how IR shuffles of loads
22010 // are lowered. Note that this only comes up because we do not always visit
22011 // operands before uses. After that is fixed this can be removed, and in the
22012 // meantime this is fairly specific to the lowering we expect from IR.
22013 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
22014 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
22015 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
22016 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
22017 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
22018 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
22019 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
22020 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
22021 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
22022 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
22023 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
22024 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22025 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22026 B.getOperand(1).getNumOperands() != 4)
22027 return false;
22028 auto SV1 = cast<ShuffleVectorSDNode>(B);
22029 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
22030 int NumElts = B.getValueType().getVectorNumElements();
22031 int NumSubElts = NumElts / 4;
22032 for (int I = 0; I < NumSubElts; I++) {
22033 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
22034 if (SV1->getMaskElt(I) != I ||
22035 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22036 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
22037 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
22038 return false;
22039 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
22040 if (SV2->getMaskElt(I) != I ||
22041 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22042 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
22043 return false;
22044 }
22045 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
22046 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
22047 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
22048 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
22049 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
22050 !Ld2->isSimple() || !Ld3->isSimple())
22051 return false;
22052 Loads.push_back(Ld0);
22053 Loads.push_back(Ld1);
22054 Loads.push_back(Ld2);
22055 Loads.push_back(Ld3);
22056 return true;
22057 }
22058 return false;
22059}
22060
22062 SelectionDAG &DAG,
22063 unsigned &NumSubLoads) {
22064 if (!Op0.hasOneUse() || !Op1.hasOneUse())
22065 return false;
22066
22067 SmallVector<LoadSDNode *> Loads0, Loads1;
22068 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22069 isLoadOrMultipleLoads(Op1, Loads1)) {
22070 if (NumSubLoads && Loads0.size() != NumSubLoads)
22071 return false;
22072 NumSubLoads = Loads0.size();
22073 return Loads0.size() == Loads1.size() &&
22074 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
22075 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
22076 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
22078 Size / 8, 1);
22079 });
22080 }
22081
22082 if (Op0.getOpcode() != Op1.getOpcode())
22083 return false;
22084
22085 switch (Op0.getOpcode()) {
22086 case ISD::ADD:
22087 case ISD::SUB:
22089 DAG, NumSubLoads) &&
22091 DAG, NumSubLoads);
22092 case ISD::SIGN_EXTEND:
22093 case ISD::ANY_EXTEND:
22094 case ISD::ZERO_EXTEND:
22095 EVT XVT = Op0.getOperand(0).getValueType();
22096 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
22097 XVT.getScalarSizeInBits() != 32)
22098 return false;
22100 DAG, NumSubLoads);
22101 }
22102 return false;
22103}
22104
22105 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
22106 // into a single load of twice the size, from which we extract the bottom and
22107 // top parts so that the shl can use a shll2 instruction. The two loads in that
22108 // example can also be larger trees of instructions, which are identical except
22109 // for the leaves, which are all loads offset from the LHS, including
22110 // buildvectors of multiple loads. For example, the RHS tree could be
22111 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
22112 // Whilst it can be common for the larger loads to replace LDP instructions
22113 // (which doesn't gain anything on its own), the larger loads can help create
22114 // more efficient code, and in buildvectors prevent the need for ld1 lane
22115 // inserts, which can be slower than normal loads.
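// A minimal illustrative case with single loads and zext:
//   add (zext (v8i8 load p) to v8i16),
//       (shl (zext (v8i8 load p+8) to v8i16), splat(8))
// becomes a single v16i8 load from p whose low and high halves are extended
// (ushll/ushll2) before the shift and add are applied.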
22117 EVT VT = N->getValueType(0);
22118 if (!VT.isFixedLengthVector() ||
22119 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
22120 VT.getScalarSizeInBits() != 64))
22121 return SDValue();
22122
22123 SDValue Other = N->getOperand(0);
22124 SDValue Shift = N->getOperand(1);
22125 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
22126 std::swap(Shift, Other);
22127 APInt ShiftAmt;
22128 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
22129 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
22130 return SDValue();
22131
22132 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
22133 !ISD::isExtOpcode(Other.getOpcode()) ||
22134 Shift.getOperand(0).getOperand(0).getValueType() !=
22135 Other.getOperand(0).getValueType() ||
22136 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
22137 return SDValue();
22138
22139 SDValue Op0 = Other.getOperand(0);
22140 SDValue Op1 = Shift.getOperand(0).getOperand(0);
22141
22142 unsigned NumSubLoads = 0;
22143 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
22144 return SDValue();
22145
22146 // Attempt to rule out some unprofitable cases using heuristics (some working
22147 // around suboptimal code generation), notably if the extend would not be able
22148 // to use ushll2 instructions because the types are not large enough. Otherwise
22149 // zips will need to be created, which can increase the instruction count.
22150 unsigned NumElts = Op0.getValueType().getVectorNumElements();
22151 unsigned NumSubElts = NumElts / NumSubLoads;
22152 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
22153 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
22154 Op0.getValueType().getSizeInBits() < 128 &&
22156 return SDValue();
22157
22158 // Recreate the tree with the new combined loads.
22159 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22160 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22161 EVT DVT =
22163
22164 SmallVector<LoadSDNode *> Loads0, Loads1;
22165 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22166 isLoadOrMultipleLoads(Op1, Loads1)) {
22167 EVT LoadVT = EVT::getVectorVT(
22168 *DAG.getContext(), Op0.getValueType().getScalarType(),
22169 Op0.getValueType().getVectorNumElements() / Loads0.size());
22170 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22171
22172 SmallVector<SDValue> NewLoads;
22173 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22174 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22175 L0->getBasePtr(), L0->getPointerInfo(),
22176 L0->getBaseAlign());
22177 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22178 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22179 NewLoads.push_back(Load);
22180 }
22181 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22182 }
22183
22185 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22186 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22187 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22188 };
22189 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22190
22191 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22192 int Hi = NumSubElts, Lo = 0;
22193 for (unsigned i = 0; i < NumSubLoads; i++) {
22194 for (unsigned j = 0; j < NumSubElts; j++) {
22195 LowMask[i * NumSubElts + j] = Lo++;
22196 HighMask[i * NumSubElts + j] = Hi++;
22197 }
22198 Lo += NumSubElts;
22199 Hi += NumSubElts;
22200 }
22201 SDLoc DL(N);
22202 SDValue Ext0, Ext1;
22203 // Extract the top and bottom lanes, then extend the result. Alternatively,
22204 // extend the result first and then extract the lanes if the two extend
22205 // opcodes match, as that produces slightly smaller code.
22206 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22208 NewOp, DAG.getConstant(0, DL, MVT::i64));
22209 SDValue SubH =
22210 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22211 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22212 SDValue Extr0 =
22213 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22214 SDValue Extr1 =
22215 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22216 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22217 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22218 } else {
22220 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22221 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22222 DAG.getConstant(0, DL, MVT::i64));
22223 SDValue SubH =
22224 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22225 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22226 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22227 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22228 }
22229 SDValue NShift =
22230 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22231 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22232}
22233
22236 // Try to change sum of two reductions.
22237 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22238 return Val;
22239 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22240 return Val;
22241 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22242 return Val;
22243 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22244 return Val;
22245 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22246 return Val;
22248 return Val;
22249 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22250 return Val;
22251 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22252 return Val;
22253 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22254 return Val;
22255
22256 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22257 return Val;
22258
22259 return performAddSubLongCombine(N, DCI);
22260}
22261
22262// Massage DAGs which we can use the high-half "long" operations on into
22263// something isel will recognize better. E.g.
22264//
22265// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22266 // (aarch64_neon_umull (extract_high (v2i64 vec))
22267 //                     (extract_high (v2i64 (dup128 scalar))))
22268//
22271 SelectionDAG &DAG) {
22272 if (DCI.isBeforeLegalizeOps())
22273 return SDValue();
22274
22275 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22276 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22277 assert(LHS.getValueType().is64BitVector() &&
22278 RHS.getValueType().is64BitVector() &&
22279 "unexpected shape for long operation");
22280
22281 // Either node could be a DUP, but it's not worth doing both of them (you'd
22282 // just as well use the non-high version) so look for a corresponding extract
22283 // operation on the other "wing".
22286 if (!RHS.getNode())
22287 return SDValue();
22290 if (!LHS.getNode())
22291 return SDValue();
22292 } else
22293 return SDValue();
22294
22295 if (IID == Intrinsic::not_intrinsic)
22296 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22297
22298 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22299 N->getOperand(0), LHS, RHS);
22300}
22301
22302static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22303 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22304 unsigned ElemBits = ElemTy.getSizeInBits();
22305
22306 int64_t ShiftAmount;
22307 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22308 APInt SplatValue, SplatUndef;
22309 unsigned SplatBitSize;
22310 bool HasAnyUndefs;
22311 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22312 HasAnyUndefs, ElemBits) ||
22313 SplatBitSize != ElemBits)
22314 return SDValue();
22315
22316 ShiftAmount = SplatValue.getSExtValue();
22317 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22318 ShiftAmount = CVN->getSExtValue();
22319 } else
22320 return SDValue();
22321
22322 // If the shift amount is zero, remove the shift intrinsic.
22323 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22324 return N->getOperand(1);
22325
22326 unsigned Opcode;
22327 bool IsRightShift;
22328 switch (IID) {
22329 default:
22330 llvm_unreachable("Unknown shift intrinsic");
22331 case Intrinsic::aarch64_neon_sqshl:
22332 Opcode = AArch64ISD::SQSHL_I;
22333 IsRightShift = false;
22334 break;
22335 case Intrinsic::aarch64_neon_uqshl:
22336 Opcode = AArch64ISD::UQSHL_I;
22337 IsRightShift = false;
22338 break;
22339 case Intrinsic::aarch64_neon_srshl:
22340 Opcode = AArch64ISD::SRSHR_I;
22341 IsRightShift = true;
22342 break;
22343 case Intrinsic::aarch64_neon_urshl:
22344 Opcode = AArch64ISD::URSHR_I;
22345 IsRightShift = true;
22346 break;
22347 case Intrinsic::aarch64_neon_sqshlu:
22348 Opcode = AArch64ISD::SQSHLU_I;
22349 IsRightShift = false;
22350 break;
22351 case Intrinsic::aarch64_neon_sshl:
22352 case Intrinsic::aarch64_neon_ushl:
22353 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22354 // left shift for positive shift amounts. For negative shifts we can use a
22355 // VASHR/VLSHR as appropriate.
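// For example (illustrative): ushl(v, splat(3)) becomes VSHL v, #3, while
// ushl(v, splat(-3)) becomes VLSHR v, #3 and sshl(v, splat(-3)) VASHR v, #3.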
22356 if (ShiftAmount < 0) {
22357 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22358 : AArch64ISD::VLSHR;
22359 ShiftAmount = -ShiftAmount;
22360 } else
22361 Opcode = AArch64ISD::VSHL;
22362 IsRightShift = false;
22363 break;
22364 }
22365
22366 EVT VT = N->getValueType(0);
22367 SDValue Op = N->getOperand(1);
22368 SDLoc DL(N);
22369 if (VT == MVT::i64) {
22370 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22371 VT = MVT::v1i64;
22372 }
22373
22374 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22375 Op = DAG.getNode(Opcode, DL, VT, Op,
22376 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32));
22377 if (N->getValueType(0) == MVT::i64)
22378 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22379 DAG.getConstant(0, DL, MVT::i64));
22380 return Op;
22381 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22382 Op = DAG.getNode(Opcode, DL, VT, Op,
22383 DAG.getConstant(ShiftAmount, DL, MVT::i32));
22384 if (N->getValueType(0) == MVT::i64)
22385 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22386 DAG.getConstant(0, DL, MVT::i64));
22387 return Op;
22388 }
22389
22390 return SDValue();
22391}
22392
22393// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22394// the intrinsics must be legal and take an i32, this means there's almost
22395// certainly going to be a zext in the DAG which we can eliminate.
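// For example (illustrative):
//   crc32b(crc, (and data, 0xff)) --> crc32b(crc, data)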
22396static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22397 SDValue AndN = N->getOperand(2);
22398 if (AndN.getOpcode() != ISD::AND)
22399 return SDValue();
22400
22402 if (!CMask || CMask->getZExtValue() != Mask)
22403 return SDValue();
22404
22405 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22406 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22407}
22408
22410 SelectionDAG &DAG) {
22411 SDLoc DL(N);
22412 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22413 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22414 N->getOperand(1)),
22415 DAG.getConstant(0, DL, MVT::i64));
22416}
22417
22419 SDLoc DL(N);
22420 SDValue Op1 = N->getOperand(1);
22421 SDValue Op2 = N->getOperand(2);
22422 EVT ScalarTy = Op2.getValueType();
22423 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22424 ScalarTy = MVT::i32;
22425
22426 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
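// For example (illustrative), index_vector(2, 3) yields <2, 5, 8, 11, ...>
// via add(mul(step_vector(1), splat(3)), splat(2)).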
22427 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22428 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22429 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22430 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22431 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22432}
22433
22435 SDLoc DL(N);
22436 SDValue Scalar = N->getOperand(3);
22437 EVT ScalarTy = Scalar.getValueType();
22438
22439 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22440 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
22441
22442 SDValue Passthru = N->getOperand(1);
22443 SDValue Pred = N->getOperand(2);
22444 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22445 Pred, Scalar, Passthru);
22446}
22447
22449 SDLoc DL(N);
22450 LLVMContext &Ctx = *DAG.getContext();
22451 EVT VT = N->getValueType(0);
22452
22453 assert(VT.isScalableVector() && "Expected a scalable vector.");
22454
22455 // Current lowering only supports the SVE-ACLE types.
22457 return SDValue();
22458
22459 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22460 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22461 EVT ByteVT =
22462 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22463
22464 // Convert everything to the domain of EXT (i.e. bytes).
22465 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22466 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22467 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22468 DAG.getConstant(ElemSize, DL, MVT::i32));
22469
22470 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22471 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22472}
22473
22476 SelectionDAG &DAG) {
22477 if (DCI.isBeforeLegalize())
22478 return SDValue();
22479
22480 SDValue Comparator = N->getOperand(3);
22481 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22482 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22483 unsigned IID = getIntrinsicID(N);
22484 EVT VT = N->getValueType(0);
22485 EVT CmpVT = N->getOperand(2).getValueType();
22486 SDValue Pred = N->getOperand(1);
22487 SDValue Imm;
22488 SDLoc DL(N);
22489
22490 switch (IID) {
22491 default:
22492 llvm_unreachable("Called with wrong intrinsic!");
22493 break;
22494
22495 // Signed comparisons
22496 case Intrinsic::aarch64_sve_cmpeq_wide:
22497 case Intrinsic::aarch64_sve_cmpne_wide:
22498 case Intrinsic::aarch64_sve_cmpge_wide:
22499 case Intrinsic::aarch64_sve_cmpgt_wide:
22500 case Intrinsic::aarch64_sve_cmplt_wide:
22501 case Intrinsic::aarch64_sve_cmple_wide: {
22502 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22503 int64_t ImmVal = CN->getSExtValue();
22504 if (ImmVal >= -16 && ImmVal <= 15)
22505 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22506 else
22507 return SDValue();
22508 }
22509 break;
22510 }
22511 // Unsigned comparisons
22512 case Intrinsic::aarch64_sve_cmphs_wide:
22513 case Intrinsic::aarch64_sve_cmphi_wide:
22514 case Intrinsic::aarch64_sve_cmplo_wide:
22515 case Intrinsic::aarch64_sve_cmpls_wide: {
22516 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22517 uint64_t ImmVal = CN->getZExtValue();
22518 if (ImmVal <= 127)
22519 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22520 else
22521 return SDValue();
22522 }
22523 break;
22524 }
22525 }
22526
22527 if (!Imm)
22528 return SDValue();
22529
22530 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22531 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22532 N->getOperand(2), Splat, DAG.getCondCode(CC));
22533 }
22534
22535 return SDValue();
22536}
22537
22540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22541
22542 SDLoc DL(Op);
22543 assert(Op.getValueType().isScalableVector() &&
22544 TLI.isTypeLegal(Op.getValueType()) &&
22545 "Expected legal scalable vector type!");
22546 assert(Op.getValueType() == Pg.getValueType() &&
22547 "Expected same type for PTEST operands");
22548
22549 // Ensure target specific opcodes are using legal type.
22550 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22551 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22552 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22553
22554 // Ensure operands have type nxv16i1.
22555 if (Op.getValueType() != MVT::nxv16i1) {
22558 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22559 else
22560 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22561 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22562 }
22563
22564 unsigned PTest = AArch64ISD::PTEST;
22566 PTest = AArch64ISD::PTEST_ANY;
22567 else if (Cond == AArch64CC::FIRST_ACTIVE)
22568 PTest = AArch64ISD::PTEST_FIRST;
22569
22570 // Set condition code (CC) flags.
22571 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
22572
22573 // Convert CC to integer based on requested condition.
22574 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22575 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22576 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22577 return DAG.getZExtOrTrunc(Res, DL, VT);
22578}
22579
22581 SelectionDAG &DAG) {
22582 SDLoc DL(N);
22583
22584 SDValue Pred = N->getOperand(1);
22585 SDValue VecToReduce = N->getOperand(2);
22586
22587 // NOTE: The integer reduction's result type is not always linked to the
22588 // operand's element type so we construct it from the intrinsic's result type.
22589 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22590 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22591
22592 // SVE reductions set the whole vector register with the first element
22593 // containing the reduction result, which we'll now extract.
22594 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22595 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22596 Zero);
22597}
22598
22600 SelectionDAG &DAG) {
22601 SDLoc DL(N);
22602
22603 SDValue Pred = N->getOperand(1);
22604 SDValue VecToReduce = N->getOperand(2);
22605
22606 EVT ReduceVT = VecToReduce.getValueType();
22607 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22608
22609 // SVE reductions set the whole vector register with the first element
22610 // containing the reduction result, which we'll now extract.
22611 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22612 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22613 Zero);
22614}
22615
22617 SelectionDAG &DAG) {
22618 SDLoc DL(N);
22619
22620 SDValue Pred = N->getOperand(1);
22621 SDValue InitVal = N->getOperand(2);
22622 SDValue VecToReduce = N->getOperand(3);
22623 EVT ReduceVT = VecToReduce.getValueType();
22624
22625 // Ordered reductions use the first lane of the result vector as the
22626 // reduction's initial value.
22627 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22628 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22629 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22630
22631 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22632
22633 // SVE reductions set the whole vector register with the first element
22634 // containing the reduction result, which we'll now extract.
22635 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22636 Zero);
22637}
22638
22640 SelectionDAG &DAG) {
22641 if (N->getValueType(0) != MVT::i16)
22642 return SDValue();
22643
22644 SDLoc DL(N);
22645 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
22646 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
22647 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
22648}
22649
22650 // If a merged operation has no inactive lanes we can relax it to a predicated
22651 // or unpredicated operation, which potentially allows better isel (perhaps
22652 // using immediate forms) or relaxes register reuse requirements.
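// For example (illustrative), with an all-active predicate:
//   sve.sqadd(ptrue_all, x, y) --> ISD::SADDSAT x, y
//   sve.subr(ptrue_all, x, y)  --> ISD::SUB y, x   (operands swapped)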
22654 SelectionDAG &DAG, bool UnpredOp = false,
22655 bool SwapOperands = false) {
22656 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22657 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22658 SDValue Pg = N->getOperand(1);
22659 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22660 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22661
22662 // ISD way to specify an all active predicate.
22663 if (isAllActivePredicate(DAG, Pg)) {
22664 if (UnpredOp)
22665 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22666
22667 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22668 }
22669
22670 // FUTURE: SplatVector(true)
22671 return SDValue();
22672}
22673
22674static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22675 SDLoc DL(N);
22676 EVT VT = N->getValueType(0);
22677 SDValue Op1 = N->getOperand(1);
22678 SDValue Op2 = N->getOperand(2);
22679 SDValue Op3 = N->getOperand(3);
22680
22681 switch (IID) {
22682 default:
22683 llvm_unreachable("Called with wrong intrinsic!");
22684 case Intrinsic::aarch64_sve_bsl:
22685 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22686 case Intrinsic::aarch64_sve_bsl1n:
22687 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22688 Op2);
22689 case Intrinsic::aarch64_sve_bsl2n:
22690 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22691 DAG.getNOT(DL, Op2, VT));
22692 case Intrinsic::aarch64_sve_nbsl:
22693 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22694 VT);
22695 }
22696}
22697
22700 const AArch64Subtarget *Subtarget) {
22701 SelectionDAG &DAG = DCI.DAG;
22702 unsigned IID = getIntrinsicID(N);
22703 switch (IID) {
22704 default:
22705 break;
22706 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22707 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22708 return tryCombineFixedPointConvert(N, DCI, DAG);
22709 case Intrinsic::aarch64_neon_saddv:
22710 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22711 case Intrinsic::aarch64_neon_uaddv:
22712 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22713 case Intrinsic::aarch64_neon_sminv:
22714 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22715 case Intrinsic::aarch64_neon_uminv:
22716 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22717 case Intrinsic::aarch64_neon_smaxv:
22718 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22719 case Intrinsic::aarch64_neon_umaxv:
22720 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22721 case Intrinsic::aarch64_neon_fmax:
22722 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22723 N->getOperand(1), N->getOperand(2));
22724 case Intrinsic::aarch64_neon_fmin:
22725 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22726 N->getOperand(1), N->getOperand(2));
22727 case Intrinsic::aarch64_neon_fmaxnm:
22728 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22729 N->getOperand(1), N->getOperand(2));
22730 case Intrinsic::aarch64_neon_fminnm:
22731 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22732 N->getOperand(1), N->getOperand(2));
22733 case Intrinsic::aarch64_neon_smull:
22734 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22735 N->getOperand(1), N->getOperand(2));
22736 case Intrinsic::aarch64_neon_umull:
22737 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22738 N->getOperand(1), N->getOperand(2));
22739 case Intrinsic::aarch64_neon_pmull:
22740 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22741 N->getOperand(1), N->getOperand(2));
22742 case Intrinsic::aarch64_neon_sqdmull:
22743 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22744 case Intrinsic::aarch64_neon_sqshl:
22745 case Intrinsic::aarch64_neon_uqshl:
22746 case Intrinsic::aarch64_neon_sqshlu:
22747 case Intrinsic::aarch64_neon_srshl:
22748 case Intrinsic::aarch64_neon_urshl:
22749 case Intrinsic::aarch64_neon_sshl:
22750 case Intrinsic::aarch64_neon_ushl:
22751 return tryCombineShiftImm(IID, N, DAG);
22752 case Intrinsic::aarch64_neon_sabd:
22753 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22754 N->getOperand(1), N->getOperand(2));
22755 case Intrinsic::aarch64_neon_uabd:
22756 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22757 N->getOperand(1), N->getOperand(2));
22758 case Intrinsic::aarch64_neon_fcvtzs:
22759 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
22760 case Intrinsic::aarch64_neon_fcvtzu:
22761 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
22762 case Intrinsic::aarch64_neon_fcvtas:
22763 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
22764 case Intrinsic::aarch64_neon_fcvtau:
22765 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
22766 case Intrinsic::aarch64_neon_fcvtms:
22767 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
22768 case Intrinsic::aarch64_neon_fcvtmu:
22769 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
22770 case Intrinsic::aarch64_neon_fcvtns:
22771 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
22772 case Intrinsic::aarch64_neon_fcvtnu:
22773 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
22774 case Intrinsic::aarch64_neon_fcvtps:
22775 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
22776 case Intrinsic::aarch64_neon_fcvtpu:
22777 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
22778 case Intrinsic::aarch64_crc32b:
22779 case Intrinsic::aarch64_crc32cb:
22780 return tryCombineCRC32(0xff, N, DAG);
22781 case Intrinsic::aarch64_crc32h:
22782 case Intrinsic::aarch64_crc32ch:
22783 return tryCombineCRC32(0xffff, N, DAG);
22784 case Intrinsic::aarch64_sve_saddv:
22785 // There is no i64 version of SADDV because the sign is irrelevant.
22786 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
22787 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22788 else
22789 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22790 case Intrinsic::aarch64_sve_uaddv:
22791 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22792 case Intrinsic::aarch64_sve_smaxv:
22793 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22794 case Intrinsic::aarch64_sve_umaxv:
22795 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22796 case Intrinsic::aarch64_sve_sminv:
22797 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22798 case Intrinsic::aarch64_sve_uminv:
22799 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22800 case Intrinsic::aarch64_sve_orv:
22801 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22802 case Intrinsic::aarch64_sve_eorv:
22803 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22804 case Intrinsic::aarch64_sve_andv:
22805 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22806 case Intrinsic::aarch64_sve_index:
22807 return LowerSVEIntrinsicIndex(N, DAG);
22808 case Intrinsic::aarch64_sve_dup:
22809 return LowerSVEIntrinsicDUP(N, DAG);
22810 case Intrinsic::aarch64_sve_dup_x:
22811 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22812 N->getOperand(1));
22813 case Intrinsic::aarch64_sve_ext:
22814 return LowerSVEIntrinsicEXT(N, DAG);
22815 case Intrinsic::aarch64_sve_mul_u:
22816 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22817 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22818 case Intrinsic::aarch64_sve_smulh_u:
22819 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22820 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22821 case Intrinsic::aarch64_sve_umulh_u:
22822 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22823 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22824 case Intrinsic::aarch64_sve_smin_u:
22825 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22826 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22827 case Intrinsic::aarch64_sve_umin_u:
22828 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22829 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22830 case Intrinsic::aarch64_sve_smax_u:
22831 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22832 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22833 case Intrinsic::aarch64_sve_umax_u:
22834 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22835 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22836 case Intrinsic::aarch64_sve_lsl_u:
22837 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22838 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22839 case Intrinsic::aarch64_sve_lsr_u:
22840 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22841 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22842 case Intrinsic::aarch64_sve_asr_u:
22843 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22844 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22845 case Intrinsic::aarch64_sve_fadd_u:
22846 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22847 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22848 case Intrinsic::aarch64_sve_fdiv_u:
22849 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22850 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22851 case Intrinsic::aarch64_sve_fmax_u:
22852 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22853 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22854 case Intrinsic::aarch64_sve_fmaxnm_u:
22855 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22856 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22857 case Intrinsic::aarch64_sve_fmla_u:
22858 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22859 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22860 N->getOperand(2));
22861 case Intrinsic::aarch64_sve_fmin_u:
22862 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22863 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22864 case Intrinsic::aarch64_sve_fminnm_u:
22865 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22866 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22867 case Intrinsic::aarch64_sve_fmul_u:
22868 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22869 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22870 case Intrinsic::aarch64_sve_fsub_u:
22871 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22872 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22873 case Intrinsic::aarch64_sve_add_u:
22874 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22875 N->getOperand(3));
22876 case Intrinsic::aarch64_sve_sub_u:
22877 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22878 N->getOperand(3));
22879 case Intrinsic::aarch64_sve_subr:
22880 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22881 case Intrinsic::aarch64_sve_and_u:
22882 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22883 N->getOperand(3));
22884 case Intrinsic::aarch64_sve_bic_u:
22885 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22886 N->getOperand(2), N->getOperand(3));
22887 case Intrinsic::aarch64_sve_saddwb:
22888 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22889 N->getOperand(1), N->getOperand(2));
22890 case Intrinsic::aarch64_sve_saddwt:
22891 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22892 N->getOperand(1), N->getOperand(2));
22893 case Intrinsic::aarch64_sve_uaddwb:
22894 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22895 N->getOperand(1), N->getOperand(2));
22896 case Intrinsic::aarch64_sve_uaddwt:
22897 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22898 N->getOperand(1), N->getOperand(2));
22899 case Intrinsic::aarch64_sve_eor_u:
22900 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22901 N->getOperand(3));
22902 case Intrinsic::aarch64_sve_orr_u:
22903 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22904 N->getOperand(3));
22905 case Intrinsic::aarch64_sve_sabd_u:
22906 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22907 N->getOperand(2), N->getOperand(3));
22908 case Intrinsic::aarch64_sve_uabd_u:
22909 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22910 N->getOperand(2), N->getOperand(3));
22911 case Intrinsic::aarch64_sve_sdiv_u:
22912 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22913 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22914 case Intrinsic::aarch64_sve_udiv_u:
22915 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22916 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22917 case Intrinsic::aarch64_sve_sqadd:
22918 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22919 case Intrinsic::aarch64_sve_sqsub_u:
22920 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22921 N->getOperand(2), N->getOperand(3));
22922 case Intrinsic::aarch64_sve_uqadd:
22923 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22924 case Intrinsic::aarch64_sve_uqsub_u:
22925 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22926 N->getOperand(2), N->getOperand(3));
22927 case Intrinsic::aarch64_sve_sqadd_x:
22928 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22929 N->getOperand(1), N->getOperand(2));
22930 case Intrinsic::aarch64_sve_sqsub_x:
22931 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22932 N->getOperand(1), N->getOperand(2));
22933 case Intrinsic::aarch64_sve_uqadd_x:
22934 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22935 N->getOperand(1), N->getOperand(2));
22936 case Intrinsic::aarch64_sve_uqsub_x:
22937 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22938 N->getOperand(1), N->getOperand(2));
22939 case Intrinsic::aarch64_sve_asrd:
22940 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22941 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22942 case Intrinsic::aarch64_sve_cmphs:
22943 if (!N->getOperand(2).getValueType().isFloatingPoint())
22944 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22945 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22946 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22947 break;
22948 case Intrinsic::aarch64_sve_cmphi:
22949 if (!N->getOperand(2).getValueType().isFloatingPoint())
22950 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22951 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22952 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22953 break;
22954 case Intrinsic::aarch64_sve_fcmpge:
22955 case Intrinsic::aarch64_sve_cmpge:
22956 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22957 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22958 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22959 break;
22960 case Intrinsic::aarch64_sve_fcmpgt:
22961 case Intrinsic::aarch64_sve_cmpgt:
22962 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22963 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22964 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22965 break;
22966 case Intrinsic::aarch64_sve_fcmpeq:
22967 case Intrinsic::aarch64_sve_cmpeq:
22968 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22969 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22970 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22971 break;
22972 case Intrinsic::aarch64_sve_fcmpne:
22973 case Intrinsic::aarch64_sve_cmpne:
22974 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22975 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22976 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22977 break;
22978 case Intrinsic::aarch64_sve_fcmpuo:
22979 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22980 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22981 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22982 break;
22983 case Intrinsic::aarch64_sve_fadda:
22984 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22985 case Intrinsic::aarch64_sve_faddv:
22986 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22987 case Intrinsic::aarch64_sve_fmaxnmv:
22988 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22989 case Intrinsic::aarch64_sve_fmaxv:
22990 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22991 case Intrinsic::aarch64_sve_fminnmv:
22992 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22993 case Intrinsic::aarch64_sve_fminv:
22994 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22995 case Intrinsic::aarch64_sve_sel:
22996 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22997 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22998 case Intrinsic::aarch64_sve_cmpeq_wide:
22999 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
23000 case Intrinsic::aarch64_sve_cmpne_wide:
23001 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
23002 case Intrinsic::aarch64_sve_cmpge_wide:
23003 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
23004 case Intrinsic::aarch64_sve_cmpgt_wide:
23005 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
23006 case Intrinsic::aarch64_sve_cmplt_wide:
23007 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
23008 case Intrinsic::aarch64_sve_cmple_wide:
23009 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
23010 case Intrinsic::aarch64_sve_cmphs_wide:
23011 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
23012 case Intrinsic::aarch64_sve_cmphi_wide:
23013 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
23014 case Intrinsic::aarch64_sve_cmplo_wide:
23015 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
23016 case Intrinsic::aarch64_sve_cmpls_wide:
23017 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
23018 case Intrinsic::aarch64_sve_ptest_any:
23019 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23021 case Intrinsic::aarch64_sve_ptest_first:
23022 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23024 case Intrinsic::aarch64_sve_ptest_last:
23025 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23027 case Intrinsic::aarch64_sve_whilelo:
23028 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
23029 N->getOperand(1), N->getOperand(2));
23030 case Intrinsic::aarch64_sve_bsl:
23031 case Intrinsic::aarch64_sve_bsl1n:
23032 case Intrinsic::aarch64_sve_bsl2n:
23033 case Intrinsic::aarch64_sve_nbsl:
23034 return combineSVEBitSel(IID, N, DAG);
23035 }
23036 return SDValue();
23037}
23038
23039static bool isCheapToExtend(const SDValue &N) {
23040 unsigned OC = N->getOpcode();
23041 return OC == ISD::LOAD || OC == ISD::MLOAD ||
23043}
23044
23045static SDValue
23047 SelectionDAG &DAG) {
23048 // If we have (sext (setcc A B)) and A and B are cheap to extend,
23049 // we can move the sext into the arguments and have the same result. For
23050 // example, if A and B are both loads, we can make those extending loads and
23051 // avoid an extra instruction. This pattern appears often in VLS code
23052 // generation where the inputs to the setcc have a different size to the
23053 // instruction that wants to use the result of the setcc.
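// A rough sketch of the transform:
//   (sext (setcc lt, (load A):v8i8, (load B):v8i8)):v8i16
//   --> (setcc lt, (sext (load A)):v8i16, (sext (load B)):v8i16)
// so the extends can later fold into extending loads.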
23054 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
23055 N->getOperand(0)->getOpcode() == ISD::SETCC);
23056 const SDValue SetCC = N->getOperand(0);
23057
23058 const SDValue CCOp0 = SetCC.getOperand(0);
23059 const SDValue CCOp1 = SetCC.getOperand(1);
23060 if (!CCOp0->getValueType(0).isInteger() ||
23061 !CCOp1->getValueType(0).isInteger())
23062 return SDValue();
23063
23064 ISD::CondCode Code =
23065 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
23066
23067 ISD::NodeType ExtType =
23068 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23069
23070 if (isCheapToExtend(SetCC.getOperand(0)) &&
23071 isCheapToExtend(SetCC.getOperand(1))) {
23072 const SDValue Ext1 =
23073 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
23074 const SDValue Ext2 =
23075 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
23076
23077 return DAG.getSetCC(
23078 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
23079 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
23080 }
23081
23082 return SDValue();
23083}
23084
23085// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
23086// This comes from interleaved vectorization. It is performed late to capture
23087// uitofp converts too.
23089 SelectionDAG &DAG) {
23090 EVT VT = N->getValueType(0);
23091 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
23092 N->getOpcode() != ISD::ZERO_EXTEND ||
23093 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
23094 return SDValue();
23095
23096 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
23097 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23098 return SDValue();
23099
23100 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
23101 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
23102 if (!Shuffle ||
23103 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
23104 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
23105 return SDValue();
23106
23107 unsigned Idx;
23109 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
23110 // An undef interleave shuffle can come up after other canonicalizations,
23111 // where the shuffle has been converted to
23112 // zext(extract(shuffle b, undef, [u,u,0,4]))
23113 bool IsUndefDeInterleave = false;
23114 if (!IsDeInterleave)
23115 IsUndefDeInterleave =
23116 Shuffle->getOperand(1).isUndef() &&
23117 all_of(
23118 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
23119 [](int M) { return M < 0; }) &&
23121 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
23122 VT.getVectorNumElements() / 2),
23123 4, Idx);
23124 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
23125 return SDValue();
23126 SDLoc DL(N);
23127 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23128 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
23129 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23130 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
23131 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
23132 VT, BC1, BC2);
23133 if ((Idx & 1) == 1)
23134 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
23135 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
23136 return DAG.getNode(
23137 ISD::AND, DL, VT, UZP,
23138 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
23139}
23140
23141 // This comes up similarly to the above when lowering deinterleaving shuffles
23142 // from zexts. We have legalized the operations in the general case to
23143 // zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
23144 // the extract is to the low half and the uzp is uzp1. There would be an extra
23145 // shift if the uzp was uzp2, to grab the upper half. Due to the combine above
23146 // there could also be an existing and / shift that can be combined in, either
23147 // before or after the extract.
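// For example (illustrative):
//   (v8i16 zext (extract_subvector (uzp1 a:v16i8, b:v16i8), 0))
//   --> (and (NVCAST a):v8i16, splat(0xff))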
23149 EVT VT = N->getValueType(0);
23150 if (N->getOpcode() != ISD::ZERO_EXTEND ||
23151 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
23152 return SDValue();
23153
23154 SDValue Op = N->getOperand(0);
23155 unsigned ExtOffset = (unsigned)-1;
23156 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23157 ExtOffset = Op.getConstantOperandVal(1);
23158 Op = Op.getOperand(0);
23159 }
23160
23161 unsigned Shift = 0;
23163 Op.getValueType().getScalarSizeInBits());
23164
23165 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23166 Shift = Op.getConstantOperandVal(1);
23167 Op = Op.getOperand(0);
23168 Mask = Mask.lshr(Shift);
23169 }
23170 if (Op.getOpcode() == ISD::AND &&
23171 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23172 Op = Op.getOperand(0);
23173 Mask = Mask.zext(VT.getScalarSizeInBits());
23174 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23175 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23176 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23177 Mask = Mask.zext(VT.getScalarSizeInBits());
23178 Op = Op.getOperand(0);
23179 }
23180
23181 if (ExtOffset == (unsigned)-1) {
23182 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23183 ExtOffset = Op.getConstantOperandVal(1);
23184 Op = Op.getOperand(0);
23185 } else
23186 return SDValue();
23187 }
23188 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23189 return SDValue();
23190
23191 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23192 return SDValue();
23193 if (Op.getOpcode() == AArch64ISD::UZP2)
23194 Shift += VT.getScalarSizeInBits() / 2;
23195
23196 SDLoc DL(N);
23197 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23198 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23199 if (Shift != 0)
23200 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23201 DAG.getConstant(Shift, DL, MVT::i32));
23202 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23203}
23204
23205static SDValue performExtendCombine(SDNode *N,
23206 TargetLowering::DAGCombinerInfo &DCI,
23207 SelectionDAG &DAG) {
23208 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23209 // we can convert that DUP into another extract_high (of a bigger DUP), which
23210 // helps the backend to decide that an sabdl2 would be useful, saving a real
23211 // extract_high operation.
23212 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23213 N->getOperand(0).getValueType().is64BitVector() &&
23214 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23215 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23216 SDNode *ABDNode = N->getOperand(0).getNode();
23217 SDValue NewABD =
23218 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
23219 if (!NewABD.getNode())
23220 return SDValue();
23221
23222 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23223 }
23224
23226 return R;
23227 if (SDValue R = performZExtUZPCombine(N, DAG))
23228 return R;
23229
23230 if (N->getValueType(0).isFixedLengthVector() &&
23231 N->getOpcode() == ISD::SIGN_EXTEND &&
23232 N->getOperand(0)->getOpcode() == ISD::SETCC)
23233 return performSignExtendSetCCCombine(N, DCI, DAG);
23234
23235 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23236 // that the top half of the result register must be unused, due to the
23237 // any_extend. This means that we can replace this pattern with (rev16
23238 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23239 // ...)), which is what this pattern would otherwise be lowered to.
23240 // Only apply this optimisation if the any_extend in the original pattern
23241 // extends to i32 or i64, because this type will become the input type to
23242 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
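  // For example (an illustrative sketch): (i32 (any_extend (i16 (bswap x))))
  // becomes (AArch64ISD::REV16 (i32 (any_extend x))), i.e. a single
  //   rev16 w0, w0
  // rather than
  //   rev w8, w0
  //   lsr w0, w8, #16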
23243 SDValue Bswap = N->getOperand(0);
23244 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23245 Bswap.getValueType() == MVT::i16 &&
23246 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23247 SDLoc DL(N);
23248 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23249 Bswap->getOperand(0));
23250 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23251 NewAnyExtend);
23252 }
23253
23254 return SDValue();
23255}
23256
23257static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
23258 SDValue SplatVal, unsigned NumVecElts) {
23259 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23260 Align OrigAlignment = St.getAlign();
23261 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23262
23263 // Create scalar stores. This is at least as good as the code sequence for a
23264 // split unaligned store which is a dup.s, ext.b, and two stores.
23265 // Most of the time the three stores should be replaced by store pair
23266 // instructions (stp).
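  // Illustrative example (assuming a v4i32 splat of w1 being stored to [x0]):
  //   str w1, [x0]
  //   str w1, [x0, #4]
  //   str w1, [x0, #8]
  //   str w1, [x0, #12]
  // which the load/store optimizer normally merges into two stp instructions.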
23267 SDLoc DL(&St);
23268 SDValue BasePtr = St.getBasePtr();
23269 uint64_t BaseOffset = 0;
23270
23271 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23272 SDValue NewST1 =
23273 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23274 OrigAlignment, St.getMemOperand()->getFlags());
23275
23276 // As this is in ISel, we will not merge this add, which may degrade results.
23277 if (BasePtr->getOpcode() == ISD::ADD &&
23278 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23279 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23280 BasePtr = BasePtr->getOperand(0);
23281 }
23282
23283 unsigned Offset = EltOffset;
23284 while (--NumVecElts) {
23285 Align Alignment = commonAlignment(OrigAlignment, Offset);
23286 SDValue OffsetPtr =
23287 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23288 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23289 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23290 PtrInfo.getWithOffset(Offset), Alignment,
23291 St.getMemOperand()->getFlags());
23292 Offset += EltOffset;
23293 }
23294 return NewST1;
23295}
23296
23297// Returns an SVE type that ContentTy can be trivially sign or zero extended
23298// into.
23299static MVT getSVEContainerType(EVT ContentTy) {
23300 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23301
23302 switch (ContentTy.getSimpleVT().SimpleTy) {
23303 default:
23304 llvm_unreachable("No known SVE container for this MVT type");
23305 case MVT::nxv2i8:
23306 case MVT::nxv2i16:
23307 case MVT::nxv2i32:
23308 case MVT::nxv2i64:
23309 case MVT::nxv2f32:
23310 case MVT::nxv2f64:
23311 return MVT::nxv2i64;
23312 case MVT::nxv4i8:
23313 case MVT::nxv4i16:
23314 case MVT::nxv4i32:
23315 case MVT::nxv4f32:
23316 return MVT::nxv4i32;
23317 case MVT::nxv8i8:
23318 case MVT::nxv8i16:
23319 case MVT::nxv8f16:
23320 case MVT::nxv8bf16:
23321 return MVT::nxv8i16;
23322 case MVT::nxv16i8:
23323 return MVT::nxv16i8;
23324 }
23325}
23326
23327static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23328 SDLoc DL(N);
23329 EVT VT = N->getValueType(0);
23330
23332 return SDValue();
23333
23334 EVT ContainerVT = VT;
23335 if (ContainerVT.isInteger())
23336 ContainerVT = getSVEContainerType(ContainerVT);
23337
23338 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23339 SDValue Ops[] = { N->getOperand(0), // Chain
23340 N->getOperand(2), // Pg
23341 N->getOperand(3), // Base
23342 DAG.getValueType(VT) };
23343
23344 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23345 SDValue LoadChain = SDValue(Load.getNode(), 1);
23346
23347 if (ContainerVT.isInteger() && (VT != ContainerVT))
23348 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23349
23350 return DAG.getMergeValues({ Load, LoadChain }, DL);
23351}
23352
23353static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23354 SDLoc DL(N);
23355 EVT VT = N->getValueType(0);
23356 EVT PtrTy = N->getOperand(3).getValueType();
23357
23358 EVT LoadVT = VT;
23359 if (VT.isFloatingPoint())
23360 LoadVT = VT.changeTypeToInteger();
23361
23362 auto *MINode = cast<MemIntrinsicSDNode>(N);
23363 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23364 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23365 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23366 MINode->getOperand(2), PassThru,
23367 MINode->getMemoryVT(), MINode->getMemOperand(),
23368 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23369
23370 if (VT.isFloatingPoint()) {
23371 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23372 return DAG.getMergeValues(Ops, DL);
23373 }
23374
23375 return L;
23376}
23377
23378template <unsigned Opcode>
23379static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
23380 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23381 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23382 "Unsupported opcode.");
23383 SDLoc DL(N);
23384 EVT VT = N->getValueType(0);
23385
23386 EVT LoadVT = VT;
23387 if (VT.isFloatingPoint())
23388 LoadVT = VT.changeTypeToInteger();
23389
23390 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23391 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23392 SDValue LoadChain = SDValue(Load.getNode(), 1);
23393
23394 if (VT.isFloatingPoint())
23395 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23396
23397 return DAG.getMergeValues({Load, LoadChain}, DL);
23398}
23399
23400static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23401 SDLoc DL(N);
23402 SDValue Data = N->getOperand(2);
23403 EVT DataVT = Data.getValueType();
23404 EVT HwSrcVt = getSVEContainerType(DataVT);
23405 SDValue InputVT = DAG.getValueType(DataVT);
23406
23407 if (DataVT.isFloatingPoint())
23408 InputVT = DAG.getValueType(HwSrcVt);
23409
23410 SDValue SrcNew;
23411 if (Data.getValueType().isFloatingPoint())
23412 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23413 else
23414 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23415
23416 SDValue Ops[] = { N->getOperand(0), // Chain
23417 SrcNew,
23418 N->getOperand(4), // Base
23419 N->getOperand(3), // Pg
23420 InputVT
23421 };
23422
23423 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23424}
23425
23426static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23427 SDLoc DL(N);
23428
23429 SDValue Data = N->getOperand(2);
23430 EVT DataVT = Data.getValueType();
23431 EVT PtrTy = N->getOperand(4).getValueType();
23432
23433 if (DataVT.isFloatingPoint())
23434 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23435
23436 auto *MINode = cast<MemIntrinsicSDNode>(N);
23437 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23438 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23439 MINode->getMemoryVT(), MINode->getMemOperand(),
23440 ISD::UNINDEXED, false, false);
23441}
23442
23443/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
23444/// load store optimizer pass will merge them to store pair stores. This should
23445/// be better than a movi to create the vector zero followed by a vector store
23446/// if the zero constant is not re-used, since one instruction and one register
23447/// live range will be removed.
23448///
23449/// For example, the final generated code should be:
23450///
23451/// stp xzr, xzr, [x0]
23452///
23453/// instead of:
23454///
23455/// movi v0.2d, #0
23456/// str q0, [x0]
23457///
23458static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23459 SDValue StVal = St.getValue();
23460 EVT VT = StVal.getValueType();
23461
23462 // Avoid scalarizing zero splat stores for scalable vectors.
23463 if (VT.isScalableVector())
23464 return SDValue();
23465
23466 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23467 // 2, 3 or 4 i32 elements.
23468 int NumVecElts = VT.getVectorNumElements();
23469 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23470 VT.getVectorElementType().getSizeInBits() == 64) ||
23471 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23472 VT.getVectorElementType().getSizeInBits() == 32)))
23473 return SDValue();
23474
23475 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23476 return SDValue();
23477
23478 // If the zero constant has more than one use then the vector store could be
23479 // better since the constant mov will be amortized and stp q instructions
23480 // should be able to be formed.
23481 if (!StVal.hasOneUse())
23482 return SDValue();
23483
23484 // If the store is truncating then it's going down to i16 or smaller, which
23485 // means it can be implemented in a single store anyway.
23486 if (St.isTruncatingStore())
23487 return SDValue();
23488
23489 // If the immediate offset of the address operand is too large for the stp
23490 // instruction, then bail out.
23491 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23492 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23493 if (Offset < -512 || Offset > 504)
23494 return SDValue();
23495 }
23496
23497 for (int I = 0; I < NumVecElts; ++I) {
23498 SDValue EltVal = StVal.getOperand(I);
23499 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23500 return SDValue();
23501 }
23502
23503 // Use a CopyFromReg WZR/XZR here to prevent
23504 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23505 SDLoc DL(&St);
23506 unsigned ZeroReg;
23507 EVT ZeroVT;
23508 if (VT.getVectorElementType().getSizeInBits() == 32) {
23509 ZeroReg = AArch64::WZR;
23510 ZeroVT = MVT::i32;
23511 } else {
23512 ZeroReg = AArch64::XZR;
23513 ZeroVT = MVT::i64;
23514 }
23515 SDValue SplatVal =
23516 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23517 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23518}
23519
23520/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
23521/// value. The load store optimizer pass will merge them to store pair stores.
23522/// This has better performance than a splat of the scalar followed by a split
23523/// vector store. Even if the stores are not merged it is four stores vs a dup,
23524/// followed by an ext.b and two stores.
23525static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23526 SDValue StVal = St.getValue();
23527 EVT VT = StVal.getValueType();
23528
23529 // Don't replace floating point stores, they possibly won't be transformed to
23530 // stp because of the store pair suppress pass.
23531 if (VT.isFloatingPoint())
23532 return SDValue();
23533
23534 // We can express a splat as store pair(s) for 2 or 4 elements.
23535 unsigned NumVecElts = VT.getVectorNumElements();
23536 if (NumVecElts != 4 && NumVecElts != 2)
23537 return SDValue();
23538
23539 // If the store is truncating then it's going down to i16 or smaller, which
23540 // means it can be implemented in a single store anyway.
23541 if (St.isTruncatingStore())
23542 return SDValue();
23543
23544 // Check that this is a splat.
23545 // Make sure that each of the relevant vector element locations are inserted
23546 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
23547 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23548 SDValue SplatVal;
23549 for (unsigned I = 0; I < NumVecElts; ++I) {
23550 // Check for insert vector elements.
23551 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23552 return SDValue();
23553
23554 // Check that same value is inserted at each vector element.
23555 if (I == 0)
23556 SplatVal = StVal.getOperand(1);
23557 else if (StVal.getOperand(1) != SplatVal)
23558 return SDValue();
23559
23560 // Check insert element index.
23561 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
23562 if (!CIndex)
23563 return SDValue();
23564 uint64_t IndexVal = CIndex->getZExtValue();
23565 if (IndexVal >= NumVecElts)
23566 return SDValue();
23567 IndexNotInserted.reset(IndexVal);
23568
23569 StVal = StVal.getOperand(0);
23570 }
23571 // Check that all vector element locations were inserted to.
23572 if (IndexNotInserted.any())
23573 return SDValue();
23574
23575 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23576}
23577
23578static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23579 SelectionDAG &DAG,
23580 const AArch64Subtarget *Subtarget) {
23581
23582 StoreSDNode *S = cast<StoreSDNode>(N);
23583 if (S->isVolatile() || S->isIndexed())
23584 return SDValue();
23585
23586 SDValue StVal = S->getValue();
23587 EVT VT = StVal.getValueType();
23588
23589 if (!VT.isFixedLengthVector())
23590 return SDValue();
23591
23592 // If we get a splat of zeros, convert this vector store to a store of
23593 // scalars. They will be merged into store pairs of xzr thereby removing one
23594 // instruction and one register.
23595 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23596 return ReplacedZeroSplat;
23597
23598 // FIXME: The logic for deciding if an unaligned store should be split should
23599 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23600 // a call to that function here.
23601
23602 if (!Subtarget->isMisaligned128StoreSlow())
23603 return SDValue();
23604
23605 // Don't split at -Oz.
23606 if (DAG.getMachineFunction().getFunction().hasMinSize())
23607 return SDValue();
23608
23609 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23610 // those up regresses performance on micro-benchmarks and olden/bh.
23611 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23612 return SDValue();
23613
23614 // Split unaligned 16B stores. They are terrible for performance.
23615 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23616 // extensions can use this to mark that it does not want splitting to happen
23617 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23618 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
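  // Sketch of the resulting split (illustrative types): an unaligned 16-byte
  // store of a v4i32 value x becomes
  //   store (v2i32 extract_subvector(x, 0)), ptr
  //   store (v2i32 extract_subvector(x, 2)), ptr + 8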
23619 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23620 S->getAlign() <= Align(2))
23621 return SDValue();
23622
23623 // If we get a splat of a scalar convert this vector store to a store of
23624 // scalars. They will be merged into store pairs thereby removing two
23625 // instructions.
23626 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23627 return ReplacedSplat;
23628
23629 SDLoc DL(S);
23630
23631 // Split VT into two.
23632 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23633 unsigned NumElts = HalfVT.getVectorNumElements();
23634 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23635 DAG.getConstant(0, DL, MVT::i64));
23636 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23637 DAG.getConstant(NumElts, DL, MVT::i64));
23638 SDValue BasePtr = S->getBasePtr();
23639 SDValue NewST1 =
23640 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23641 S->getAlign(), S->getMemOperand()->getFlags());
23642 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23643 DAG.getConstant(8, DL, MVT::i64));
23644 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23645 S->getPointerInfo(), S->getAlign(),
23646 S->getMemOperand()->getFlags());
23647}
23648
23649static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
23650 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23651
23652 // splice(pg, op1, undef) -> op1
23653 if (N->getOperand(2).isUndef())
23654 return N->getOperand(1);
23655
23656 return SDValue();
23657}
23658
23659static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
23660 const AArch64Subtarget *Subtarget) {
23661 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23662 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23663 "Unexpected Opcode!");
23664
23665 // uunpklo/hi undef -> undef
23666 if (N->getOperand(0).isUndef())
23667 return DAG.getUNDEF(N->getValueType(0));
23668
23669 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23670 // extending load. We can do this even if this is already a masked
23671 // {z,}extload.
23672 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23673 N->getOpcode() == AArch64ISD::UUNPKLO) {
23674 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23675 SDValue Mask = MLD->getMask();
23676 SDLoc DL(N);
23677
23678 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23679 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23680 (MLD->getPassThru()->isUndef() ||
23681 isZerosVector(MLD->getPassThru().getNode()))) {
23682 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23683 unsigned PgPattern = Mask->getConstantOperandVal(0);
23684 EVT VT = N->getValueType(0);
23685
23686 // Ensure we can double the size of the predicate pattern
23687 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23688 if (NumElts &&
23689 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23690 Mask =
23691 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23692 SDValue PassThru = DAG.getConstant(0, DL, VT);
23693 SDValue NewLoad = DAG.getMaskedLoad(
23694 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23695 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23697
23698 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23699
23700 return NewLoad;
23701 }
23702 }
23703 }
23704
23705 return SDValue();
23706}
23707
23709 if (N->getOpcode() != AArch64ISD::UZP1)
23710 return false;
23711 SDValue Op0 = N->getOperand(0);
23712 EVT SrcVT = Op0->getValueType(0);
23713 EVT DstVT = N->getValueType(0);
23714 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23715 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23716 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23717}
23718
23719// Try to combine rounding shifts where the operands come from an extend, and
23720// the result is truncated and combined into one vector.
23721// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
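// As a typed sketch (an illustration, not an exhaustive list of cases): with
// X: nxv16i8 unpacked into two nxv8i16 halves, rounding-narrowing both halves
// by the same shift C and re-packing them with uzp1 is replaced by a single
// predicated URSHR_I_PRED of X by C.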
23722static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23723 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23724 SDValue Op0 = N->getOperand(0);
23725 SDValue Op1 = N->getOperand(1);
23726 EVT ResVT = N->getValueType(0);
23727
23728 unsigned RshOpc = Op0.getOpcode();
23729 if (RshOpc != AArch64ISD::RSHRNB_I)
23730 return SDValue();
23731
23732 // Same op code and imm value?
23733 SDValue ShiftValue = Op0.getOperand(1);
23734 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23735 return SDValue();
23736
23737 // Same unextended operand value?
23738 SDValue Lo = Op0.getOperand(0);
23739 SDValue Hi = Op1.getOperand(0);
23740 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23741 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23742 return SDValue();
23743 SDValue OrigArg = Lo.getOperand(0);
23744 if (OrigArg != Hi.getOperand(0))
23745 return SDValue();
23746
23747 SDLoc DL(N);
23748 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23749 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23750 ShiftValue);
23751}
23752
23753// Try to simplify:
23754// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23755// t2 = nxv8i16 srl(t1, ShiftValue)
23756// to
23757// t1 = nxv8i16 rshrnb(X, shiftvalue).
23758// rshrnb will zero the top half bits of each element. Therefore, this combine
23759// should only be performed when a following instruction with the rshrnb
23760// as an operand does not care about the top half of each element. For example,
23761// a uzp1 or a truncating store.
23762static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23763 const AArch64Subtarget *Subtarget) {
23764 EVT VT = Srl->getValueType(0);
23765 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23766 return SDValue();
23767
23768 EVT ResVT;
23769 if (VT == MVT::nxv8i16)
23770 ResVT = MVT::nxv16i8;
23771 else if (VT == MVT::nxv4i32)
23772 ResVT = MVT::nxv8i16;
23773 else if (VT == MVT::nxv2i64)
23774 ResVT = MVT::nxv4i32;
23775 else
23776 return SDValue();
23777
23778 SDLoc DL(Srl);
23779 unsigned ShiftValue;
23780 SDValue RShOperand;
23781 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23782 return SDValue();
23783 SDValue Rshrnb = DAG.getNode(
23784 AArch64ISD::RSHRNB_I, DL, ResVT,
23785 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23786 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23787}
23788
23789static SDValue isNVCastToHalfWidthElements(SDValue V) {
23790 if (V.getOpcode() != AArch64ISD::NVCAST)
23791 return SDValue();
23792
23793 SDValue Op = V.getOperand(0);
23794 if (!Op.getValueType().isVector() ||
23795 V.getValueType().getVectorElementCount() !=
23796 Op.getValueType().getVectorElementCount() * 2)
23797 return SDValue();
23798
23799 return Op;
23800}
23801
23802static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23803 const AArch64Subtarget *Subtarget) {
23804 SDLoc DL(N);
23805 SDValue Op0 = N->getOperand(0);
23806 SDValue Op1 = N->getOperand(1);
23807 EVT ResVT = N->getValueType(0);
23808
23809 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23810 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23811 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23812 Op0.getOperand(0) == Op1.getOperand(0)) {
23813
23814 SDValue SourceVec = Op0.getOperand(0);
23815 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23816 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23817 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23818 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23819 EVT OpVT = Op0.getOperand(1).getValueType();
23820 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23821 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23822 DAG.getUNDEF(WidenedResVT));
23823 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23824 DAG.getConstant(0, DL, OpVT));
23825 }
23826 }
23827
23828 // Following optimizations only work with uzp1.
23829 if (N->getOpcode() == AArch64ISD::UZP2)
23830 return SDValue();
23831
23832 // uzp1(x, undef) -> concat(truncate(x), undef)
23833 if (Op1.getOpcode() == ISD::UNDEF) {
23834 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23835 switch (ResVT.getSimpleVT().SimpleTy) {
23836 default:
23837 break;
23838 case MVT::v16i8:
23839 BCVT = MVT::v8i16;
23840 HalfVT = MVT::v8i8;
23841 break;
23842 case MVT::v8i16:
23843 BCVT = MVT::v4i32;
23844 HalfVT = MVT::v4i16;
23845 break;
23846 case MVT::v4i32:
23847 BCVT = MVT::v2i64;
23848 HalfVT = MVT::v2i32;
23849 break;
23850 }
23851 if (BCVT != MVT::Other) {
23852 SDValue BC = DAG.getBitcast(BCVT, Op0);
23853 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23854 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23855 DAG.getUNDEF(HalfVT));
23856 }
23857 }
23858
23859 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23860 return Urshr;
23861
23862 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23863 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23864 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23865 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23866 }
23867 }
23868
23869 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23870 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23871 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23872 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23873 }
23874 }
23875
23876 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23877 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23878 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23879 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23880 SDValue X = PreCast.getOperand(0).getOperand(0);
23881 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23882 }
23883 }
23884 }
23885
23886 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23887 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23888 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23889 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23890 SDValue Z = PreCast.getOperand(0).getOperand(1);
23891 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23892 }
23893 }
23894 }
23895
23896 // These optimizations only work on little endian.
23897 if (!DAG.getDataLayout().isLittleEndian())
23898 return SDValue();
23899
23900 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23901 // Example:
23902 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23903 // to
23904 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23906 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23907 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23908 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23909 Op1.getOperand(0));
23910 }
23911 }
23912
23913 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23914 return SDValue();
23915
23916 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23917 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23918
23919 // truncating uzp1(x, y) -> xtn(concat (x, y))
23920 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23921 EVT Op0Ty = SourceOp0.getValueType();
23922 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23923 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23924 SDValue Concat =
23927 SourceOp0, SourceOp1);
23928 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23929 }
23930 }
23931
23932 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23933 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23934 SourceOp1.getOpcode() != ISD::TRUNCATE)
23935 return SDValue();
23936 SourceOp0 = SourceOp0.getOperand(0);
23937 SourceOp1 = SourceOp1.getOperand(0);
23938
23939 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23940 !SourceOp0.getValueType().isSimple())
23941 return SDValue();
23942
23943 EVT ResultTy;
23944
23945 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23946 case MVT::v2i64:
23947 ResultTy = MVT::v4i32;
23948 break;
23949 case MVT::v4i32:
23950 ResultTy = MVT::v8i16;
23951 break;
23952 case MVT::v8i16:
23953 ResultTy = MVT::v16i8;
23954 break;
23955 default:
23956 return SDValue();
23957 }
23958
23959 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23960 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23961 SDValue UzpResult =
23962 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23963
23964 EVT BitcastResultTy;
23965
23966 switch (ResVT.getSimpleVT().SimpleTy) {
23967 case MVT::v2i32:
23968 BitcastResultTy = MVT::v2i64;
23969 break;
23970 case MVT::v4i16:
23971 BitcastResultTy = MVT::v4i32;
23972 break;
23973 case MVT::v8i8:
23974 BitcastResultTy = MVT::v8i16;
23975 break;
23976 default:
23977 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23978 }
23979
23980 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23981 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23982}
23983
23984static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23985 unsigned Opc = N->getOpcode();
23986
23987 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23988 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23989 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23990 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23991 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23992 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23993 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23994 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23995
23996 SDLoc DL(N);
23997 SDValue Chain = N->getOperand(0);
23998 SDValue Pg = N->getOperand(1);
23999 SDValue Base = N->getOperand(2);
24000 SDValue Offset = N->getOperand(3);
24001 SDValue Ty = N->getOperand(4);
24002
24003 EVT ResVT = N->getValueType(0);
24004
24005 const auto OffsetOpc = Offset.getOpcode();
24006 const bool OffsetIsZExt =
24007 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
24008 const bool OffsetIsSExt =
24009 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
24010
24011 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
24012 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
24013 SDValue ExtPg = Offset.getOperand(0);
24014 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
24015 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
24016
24017 // If the predicate for the sign- or zero-extended offset is the
24018 // same as the predicate used for this load and the sign-/zero-extension
24019 // was from 32 bits...
24020 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
24021 SDValue UnextendedOffset = Offset.getOperand(1);
24022
24023 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
24024 if (Signed)
24025 NewOpc = getSignExtendedGatherOpcode(NewOpc);
24026
24027 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
24028 {Chain, Pg, Base, UnextendedOffset, Ty});
24029 }
24030 }
24031
24032 return SDValue();
24033}
24034
24035/// Optimize a vector shift instruction and its operand if shifted out
24036/// bits are not used.
24037static SDValue performVectorShiftCombine(SDNode *N,
24038 const AArch64TargetLowering &TLI,
24039 TargetLowering::DAGCombinerInfo &DCI) {
24040 assert(N->getOpcode() == AArch64ISD::VASHR ||
24041 N->getOpcode() == AArch64ISD::VLSHR);
24042
24043 SDValue Op = N->getOperand(0);
24044 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
24045
24046 unsigned ShiftImm = N->getConstantOperandVal(1);
24047 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
24048
24049 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
24050 if (N->getOpcode() == AArch64ISD::VASHR &&
24051 Op.getOpcode() == AArch64ISD::VSHL &&
24052 N->getOperand(1) == Op.getOperand(1))
24053 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
24054 return Op.getOperand(0);
24055
24056 // If the shift is exact, the shifted out bits matter.
24057 if (N->getFlags().hasExact())
24058 return SDValue();
24059
24060 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
24061 APInt DemandedMask = ~ShiftedOutBits;
24062
24063 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
24064 return SDValue(N, 0);
24065
24066 return SDValue();
24067}
24068
24069static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
24070 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
24071 // This transform works in partnership with performSetCCPunpkCombine to
24072 // remove unnecessary transfer of predicates into standard registers and back
24073 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
24074 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
24075 MVT::i1) {
24076 SDValue CC = N->getOperand(0)->getOperand(0);
24077 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
24078 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
24079 DAG.getVectorIdxConstant(0, SDLoc(N)));
24080 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
24081 }
24082
24083 return SDValue();
24084}
24085
24086/// Target-specific DAG combine function for post-increment LD1 (lane) and
24087/// post-increment LD1R.
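/// For example (an illustrative sketch): a scalar load that is inserted into
/// lane 1 of a v4i32 and whose address is then incremented by the element
/// size, i.e.
///   x = load i32, p;  v = insertelement v, x, 1;  p = p + 4
/// maps onto the post-incremented form "ld1 { v0.s }[1], [x0], #4".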
24088static SDValue performPostLD1Combine(SDNode *N,
24089 TargetLowering::DAGCombinerInfo &DCI,
24090 bool IsLaneOp) {
24091 if (DCI.isBeforeLegalizeOps())
24092 return SDValue();
24093
24094 SelectionDAG &DAG = DCI.DAG;
24095 EVT VT = N->getValueType(0);
24096
24097 if (!VT.is128BitVector() && !VT.is64BitVector())
24098 return SDValue();
24099
24100 // If it is not a LOAD, we cannot do such a combine.
24101 unsigned LoadIdx = IsLaneOp ? 1 : 0;
24102 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
24103 if (!LD)
24104 return SDValue();
24105
24106 // If the Generic combiner already helped form a pre- or post-indexed load,
24107 // skip forming one here.
24108 if (LD->isIndexed())
24109 return SDValue();
24110
24111 // The vector lane must be a constant in the LD1LANE opcode.
24112 SDValue Lane;
24113 if (IsLaneOp) {
24114 Lane = N->getOperand(2);
24115 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
24116 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
24117 return SDValue();
24118 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
24119 return SDValue();
24120 }
24121
24122 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
24123 EVT MemVT = LoadSDN->getMemoryVT();
24124 // Check if memory operand is the same type as the vector element.
24125 if (MemVT != VT.getVectorElementType())
24126 return SDValue();
24127
24128 // Check if there are other uses. If so, do not combine as it will introduce
24129 // an extra load.
24130 for (SDUse &U : LD->uses()) {
24131 if (U.getResNo() == 1) // Ignore uses of the chain result.
24132 continue;
24133 if (U.getUser() != N)
24134 return SDValue();
24135 }
24136
24137 // If there is one use and it can splat the value, prefer that operation.
24138 // TODO: This could be expanded to more operations if they reliably use the
24139 // index variants.
24140 if (N->hasOneUse()) {
24141 unsigned UseOpc = N->user_begin()->getOpcode();
24142 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
24143 return SDValue();
24144 }
24145
24146 SDValue Addr = LD->getOperand(1);
24147 SDValue Vector = N->getOperand(0);
24148 // Search for a use of the address operand that is an increment.
24149 for (SDUse &Use : Addr->uses()) {
24150 SDNode *User = Use.getUser();
24151 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24152 continue;
24153
24154 // If the increment is a constant, it must match the memory ref size.
24155 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24156 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24157 uint32_t IncVal = CInc->getZExtValue();
24158 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24159 if (IncVal != NumBytes)
24160 continue;
24161 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24162 }
24163
24164 // To avoid cycle construction make sure that neither the load nor the add
24165 // are predecessors to each other or the Vector.
24168 Visited.insert(Addr.getNode());
24169 Worklist.push_back(User);
24170 Worklist.push_back(LD);
24171 Worklist.push_back(Vector.getNode());
24172 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24173 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24174 continue;
24175
24177 Ops.push_back(LD->getOperand(0)); // Chain
24178 if (IsLaneOp) {
24179 Ops.push_back(Vector); // The vector to be inserted
24180 Ops.push_back(Lane); // The lane to be inserted in the vector
24181 }
24182 Ops.push_back(Addr);
24183 Ops.push_back(Inc);
24184
24185 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24186 SDVTList SDTys = DAG.getVTList(Tys);
24187 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24188 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24189 MemVT,
24190 LoadSDN->getMemOperand());
24191
24192 // Update the uses.
24193 SDValue NewResults[] = {
24194 SDValue(LD, 0), // The result of load
24195 SDValue(UpdN.getNode(), 2) // Chain
24196 };
24197 DCI.CombineTo(LD, NewResults);
24198 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24199 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24200
24201 break;
24202 }
24203 return SDValue();
24204}
24205
24206/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24207/// address translation.
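/// For example (illustrative): if a pointer tag is inserted with
///   orr x1, x0, #0xff00000000000000
/// before a memory access, the tagged address can keep using x0 directly,
/// since only bits [55:0] participate in translation; SimplifyDemandedBits is
/// therefore queried with just the low 56 bits demanded.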
24208static bool performTBISimplification(SDValue Addr,
24209 TargetLowering::DAGCombinerInfo &DCI,
24210 SelectionDAG &DAG) {
24211 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
24212 KnownBits Known;
24213 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24214 !DCI.isBeforeLegalizeOps());
24215 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24216 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24217 DCI.CommitTargetLoweringOpt(TLO);
24218 return true;
24219 }
24220 return false;
24221}
24222
24223static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24224 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24225 "Expected STORE dag node in input!");
24226
24227 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24228 if (!Store->isTruncatingStore() || Store->isIndexed())
24229 return SDValue();
24230 SDValue Ext = Store->getValue();
24231 auto ExtOpCode = Ext.getOpcode();
24232 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24233 ExtOpCode != ISD::ANY_EXTEND)
24234 return SDValue();
24235 SDValue Orig = Ext->getOperand(0);
24236 if (Store->getMemoryVT() != Orig.getValueType())
24237 return SDValue();
24238 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24239 Store->getBasePtr(), Store->getMemOperand());
24240 }
24241
24242 return SDValue();
24243}
24244
24245// A custom combine to lower load <3 x i8> as the more efficient sequence
24246// below:
24247// ldrb wX, [x0, #2]
24248// ldrh wY, [x0]
24249// orr wX, wY, wX, lsl #16
24250// fmov s0, wX
24251//
24252// Note that an alternative sequence with even fewer (although usually more
24253// complex/expensive) instructions would be:
24254// ld1r.4h { v0 }, [x0], #2
24255// ld1.b { v0 }[2], [x0]
24256//
24257// Generating this sequence unfortunately results in noticeably worse codegen
24258// for code that extends the loaded v3i8, due to legalization breaking vector
24259// shuffle detection in a way that is very difficult to work around.
24260// TODO: Revisit once v3i8 legalization has been improved in general.
24261static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24262 EVT MemVT = LD->getMemoryVT();
24263 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24264 LD->getBaseAlign() >= 4)
24265 return SDValue();
24266
24267 SDLoc DL(LD);
24269 SDValue Chain = LD->getChain();
24270 SDValue BasePtr = LD->getBasePtr();
24271 MachineMemOperand *MMO = LD->getMemOperand();
24272 assert(LD->getOffset().isUndef() && "undef offset expected");
24273
24274 // Load 2 x i8, then 1 x i8.
24275 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24276 TypeSize Offset2 = TypeSize::getFixed(2);
24277 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24278 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24279 MF.getMachineMemOperand(MMO, 2, 1));
24280
24281 // Extend to i32.
24282 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24283 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24284
24285 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24286 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24287 DAG.getConstant(16, DL, MVT::i32));
24288 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24289 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24290
24291 // Extract v3i8 again.
24292 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24293 DAG.getConstant(0, DL, MVT::i64));
24294 SDValue TokenFactor = DAG.getNode(
24295 ISD::TokenFactor, DL, MVT::Other,
24296 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24297 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24298}
24299
24300// Perform TBI simplification if supported by the target and try to break up
24301// nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
24302// load instructions can be selected.
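// For example (a sketch): a non-temporal load of v17i32 (544 bits) is broken
// into two 256-bit v8i32 loads plus a v1i32 remainder load, which are then
// concatenated and trimmed back to the original type with EXTRACT_SUBVECTOR.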
24303static SDValue performLOADCombine(SDNode *N,
24304 TargetLowering::DAGCombinerInfo &DCI,
24305 SelectionDAG &DAG,
24306 const AArch64Subtarget *Subtarget) {
24307 if (Subtarget->supportsAddressTopByteIgnored())
24308 performTBISimplification(N->getOperand(1), DCI, DAG);
24309
24310 LoadSDNode *LD = cast<LoadSDNode>(N);
24311 EVT RegVT = LD->getValueType(0);
24312 EVT MemVT = LD->getMemoryVT();
24313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24314 SDLoc DL(LD);
24315
24316 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24317 unsigned AddrSpace = LD->getAddressSpace();
24318 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24319 AddrSpace == ARM64AS::PTR32_UPTR) {
24320 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24321 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24322 SDValue Cast =
24323 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24324 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24325 Cast, LD->getPointerInfo(), MemVT,
24326 LD->getBaseAlign(),
24327 LD->getMemOperand()->getFlags());
24328 }
24329 }
24330
24331 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24332 return SDValue(N, 0);
24333
24334 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24335 return Res;
24336
24337 if (!LD->isNonTemporal())
24338 return SDValue(N, 0);
24339
24340 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24341 MemVT.getSizeInBits() % 256 == 0 ||
24342 256 % MemVT.getScalarSizeInBits() != 0)
24343 return SDValue(N, 0);
24344
24345 SDValue Chain = LD->getChain();
24346 SDValue BasePtr = LD->getBasePtr();
24347 SDNodeFlags Flags = LD->getFlags();
24348 SmallVector<SDValue, 4> LoadOps;
24349 SmallVector<SDValue, 4> LoadOpsChain;
24350 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24351 // and a scalar/vector load of less than 256 bits. This way we can utilize 256-bit
24352 // loads and reduce the number of load instructions generated.
24353 MVT NewVT =
24355 256 / MemVT.getVectorElementType().getSizeInBits());
24356 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24357 // Create all 256-bit loads starting from offset 0 up to (Num256Loads - 1) * 32.
24358 for (unsigned I = 0; I < Num256Loads; I++) {
24359 unsigned PtrOffset = I * 32;
24360 SDValue NewPtr = DAG.getMemBasePlusOffset(
24361 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24362 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24363 SDValue NewLoad = DAG.getLoad(
24364 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24365 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24366 LoadOps.push_back(NewLoad);
24367 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24368 }
24369
24370 // Process remaining bits of the load operation.
24371 // This is done by creating an UNDEF vector to match the size of the
24372 // 256-bit loads and inserting the remaining load to it. We extract the
24373 // original load type at the end using EXTRACT_SUBVECTOR instruction.
24374 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24375 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24376 MVT RemainingVT = MVT::getVectorVT(
24378 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24379 SDValue NewPtr = DAG.getMemBasePlusOffset(
24380 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24381 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24382 SDValue RemainingLoad =
24383 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24384 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24385 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24386 SDValue UndefVector = DAG.getUNDEF(NewVT);
24387 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24388 SDValue ExtendedRemainingLoad =
24389 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24390 {UndefVector, RemainingLoad, InsertIdx});
24391 LoadOps.push_back(ExtendedRemainingLoad);
24392 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24393 EVT ConcatVT =
24395 LoadOps.size() * NewVT.getVectorNumElements());
24396 SDValue ConcatVectors =
24397 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24398 // Extract the original vector type size.
24399 SDValue ExtractSubVector =
24400 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24401 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24402 SDValue TokenFactor =
24403 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24404 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24405}
24406
24407static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24408 EVT VecVT = Op.getValueType();
24409 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24410 "Need boolean vector type.");
24411
24412 if (Depth > 3)
24414
24415 // We can get the base type from a vector compare or truncate.
24416 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24417 return Op.getOperand(0).getValueType();
24418
24419 // If an operand is a bool vector, continue looking.
24421 for (SDValue Operand : Op->op_values()) {
24422 if (Operand.getValueType() != VecVT)
24423 continue;
24424
24425 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24426 if (!BaseVT.isSimple())
24427 BaseVT = OperandVT;
24428 else if (OperandVT != BaseVT)
24430 }
24431
24432 return BaseVT;
24433}
24434
24435// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24436// iN, we can use a trick that extracts the i^th bit from the i^th element and
24437// then performs a vector add to get a scalar bitmask. This requires that each
24438// element's bits are either all 1 or all 0.
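// Illustrative sketch (a v4i32 input is an assumption for exposition): after
// sign-extending the i1 results so each lane is all-ones or all-zeros, an AND
// with the mask <1, 2, 4, 8> keeps one distinct bit per lane, and a
// VECREDUCE_ADD of that value yields the 4-bit scalar bitmask.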
24439static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24440 SDLoc DL(N);
24441 SDValue ComparisonResult(N, 0);
24442 EVT VecVT = ComparisonResult.getValueType();
24443 assert(VecVT.isVector() && "Must be a vector type");
24444
24445 unsigned NumElts = VecVT.getVectorNumElements();
24446 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24447 return SDValue();
24448
24449 if (VecVT.getVectorElementType() != MVT::i1 &&
24450 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24451 return SDValue();
24452
24453 // If we can find the original types to work on instead of a vector of i1,
24454 // we can avoid extend/extract conversion instructions.
24455 if (VecVT.getVectorElementType() == MVT::i1) {
24456 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24457 if (!VecVT.isSimple()) {
24458 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24459 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24460 }
24461 }
24462 VecVT = VecVT.changeVectorElementTypeToInteger();
24463
24464 // Large vectors don't map directly to this conversion, so to avoid too many
24465 // edge cases, we don't apply it here. The conversion will likely still be
24466 // applied later via multiple smaller vectors, whose results are concatenated.
24467 if (VecVT.getSizeInBits() > 128)
24468 return SDValue();
24469
24470 // Ensure that all elements' bits are either 0s or 1s.
24471 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24472
24473 bool IsLE = DAG.getDataLayout().isLittleEndian();
24474 SmallVector<SDValue, 16> MaskConstants;
24476 VecVT == MVT::v16i8) {
24477 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24478 // per entry. We split it into two halves, apply the mask, zip the halves to
24479 // create 8x 16-bit values, and then perform the vector reduce.
24480 for (unsigned Half = 0; Half < 2; ++Half) {
24481 for (unsigned I = 0; I < 8; ++I) {
24482 // On big-endian targets, the lane order in sub-byte vector elements
24483 // gets reversed, so we need to flip the bit index.
24484 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
24485 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24486 }
24487 }
24488 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24489 SDValue RepresentativeBits =
24490 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24491
24492 SDValue UpperRepresentativeBits =
24493 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24494 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24495 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24496 RepresentativeBits, UpperRepresentativeBits);
24497 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24498 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24499 }
24500
24501 // All other vector sizes.
24502 unsigned NumEl = VecVT.getVectorNumElements();
24503 for (unsigned I = 0; I < NumEl; ++I) {
24504 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
24505 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24506 }
24507
24508 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24509 SDValue RepresentativeBits =
24510 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24511 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24512 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24513 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24514}
24515
24516static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24517 StoreSDNode *Store) {
24518 if (!Store->isTruncatingStore())
24519 return SDValue();
24520
24521 SDLoc DL(Store);
24522 SDValue VecOp = Store->getValue();
24523 EVT VT = VecOp.getValueType();
24524 EVT MemVT = Store->getMemoryVT();
24525
24526 if (!MemVT.isVector() || !VT.isVector() ||
24527 MemVT.getVectorElementType() != MVT::i1)
24528 return SDValue();
24529
24530 // If we are storing a vector that we are currently building, let
24531 // `scalarizeVectorStore()` handle this more efficiently.
24532 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24533 return SDValue();
24534
24535 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24536 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24537 if (!VectorBits)
24538 return SDValue();
24539
24540 EVT StoreVT =
24542 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24543 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24544 Store->getMemOperand());
24545}
24546
24547// Combine store (fp_to_int X) to use vector semantics around the conversion
24548// when NEON is available. This allows us to store the in-vector result directly
24549// without transferring the result into a GPR in the process.
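// Illustrative sketch (assuming an f32 -> i32 conversion feeding the store):
//   fcvtzs w8, s0
//   str    w8, [x0]
// can instead stay on the FP/SIMD side as
//   fcvtzs s0, s0
//   str    s0, [x0]
// avoiding the FPR->GPR transfer.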
24550static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24551 TargetLowering::DAGCombinerInfo &DCI,
24552 SelectionDAG &DAG,
24553 const AArch64Subtarget *Subtarget) {
24554 // Limit to post-legalization in order to avoid peeling truncating stores.
24555 if (DCI.isBeforeLegalize())
24556 return SDValue();
24557 if (!Subtarget->isNeonAvailable())
24558 return SDValue();
24559 // Source operand is already a vector.
24560 SDValue Value = ST->getValue();
24561 if (Value.getValueType().isVector())
24562 return SDValue();
24563
24564 // Look through potential assertions.
24565 while (Value->isAssert())
24566 Value = Value.getOperand(0);
24567
24568 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24569 Value.getOpcode() != ISD::FP_TO_UINT)
24570 return SDValue();
24571 if (!Value->hasOneUse())
24572 return SDValue();
24573
24574 SDValue FPSrc = Value.getOperand(0);
24575 EVT SrcVT = FPSrc.getValueType();
24576 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24577 return SDValue();
24578
24579 // No support for mismatched conversions such as i64 = fp_to_sint f32
24580 EVT VT = Value.getSimpleValueType();
24581 if (VT != SrcVT.changeTypeToInteger())
24582 return SDValue();
24583
24584 // Create a 128-bit element vector to avoid widening. The floating point
24585 // conversion is transformed into a single element conversion via a pattern.
24586 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24587 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24588 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24589 SDLoc DL(ST);
24590 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24591 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24592
24594 SDValue Extracted =
24595 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24596
24597 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24598 return SDValue(ST, 0);
24599}
24600
24601bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24602 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24603 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24604 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24605}
24606
24607// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
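// Illustrative sketch: for "store (trunc v3i32 X to v3i8)", X is widened to
// v4i32, bitcast to v16i8, and bytes 8, 4 and 0 are stored individually at
// offsets #2, #1 and #0.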
24608static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24609 const AArch64Subtarget *Subtarget) {
24610 SDValue Value = ST->getValue();
24611 EVT ValueVT = Value.getValueType();
24612
24613 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24614 Value.getOpcode() != ISD::TRUNCATE ||
24615 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24616 return SDValue();
24617
24618 assert(ST->getOffset().isUndef() && "undef offset expected");
24619 SDLoc DL(ST);
24620 auto WideVT = EVT::getVectorVT(
24621 *DAG.getContext(),
24622 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24623 SDValue UndefVector = DAG.getUNDEF(WideVT);
24624 SDValue WideTrunc = DAG.getNode(
24625 ISD::INSERT_SUBVECTOR, DL, WideVT,
24626 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24627 SDValue Cast = DAG.getNode(
24628 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24629 WideTrunc);
24630
24632 SDValue Chain = ST->getChain();
24633 MachineMemOperand *MMO = ST->getMemOperand();
24634 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24635 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24636 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24637 TypeSize Offset2 = TypeSize::getFixed(2);
24638 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24639 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24640
24641 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24642 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24643 TypeSize Offset1 = TypeSize::getFixed(1);
24644 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24645 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24646
24647 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24648 DAG.getConstant(0, DL, MVT::i64));
24649 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24650 MF.getMachineMemOperand(MMO, 0, 1));
24651 return Chain;
24652}
24653
24654static unsigned getFPSubregForVT(EVT VT) {
24655 assert(VT.isSimple() && "Expected simple VT");
24656 switch (VT.getSimpleVT().SimpleTy) {
24657 case MVT::aarch64mfp8:
24658 return AArch64::bsub;
24659 case MVT::f16:
24660 return AArch64::hsub;
24661 case MVT::f32:
24662 return AArch64::ssub;
24663 case MVT::f64:
24664 return AArch64::dsub;
24665 default:
24666 llvm_unreachable("Unexpected VT!");
24667 }
24668}
24669
24670static SDValue performSTORECombine(SDNode *N,
24671 TargetLowering::DAGCombinerInfo &DCI,
24672 SelectionDAG &DAG,
24673 const AArch64Subtarget *Subtarget) {
24674 StoreSDNode *ST = cast<StoreSDNode>(N);
24675 SDValue Chain = ST->getChain();
24676 SDValue Value = ST->getValue();
24677 SDValue Ptr = ST->getBasePtr();
24678 EVT ValueVT = Value.getValueType();
24679 EVT MemVT = ST->getMemoryVT();
24680 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24681 SDLoc DL(ST);
24682
24683 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24684 return Res;
24685
24686 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24687 EVT EltVT = VT.getVectorElementType();
24688 return EltVT == MVT::f32 || EltVT == MVT::f64;
24689 };
24690
24691 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24692 unsigned AddrSpace = ST->getAddressSpace();
24693 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24694 AddrSpace == ARM64AS::PTR32_UPTR) {
24695 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24696 if (PtrVT != Ptr.getSimpleValueType()) {
24697 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24698 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24699 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24700 ST->getAAInfo());
24701 }
24702 }
24703
24704 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24705 return Res;
24706
24707 // If this is an FP_ROUND followed by a store, fold this into a truncating
24708 // store. We can do this even if this is already a truncstore.
24709 // We purposefully don't care about legality of the nodes here as we know
24710 // they can be split down into something legal.
24711 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24712 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24713 Subtarget->useSVEForFixedLengthVectors() &&
24714 ValueVT.isFixedLengthVector() &&
24715 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24716 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24717 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24718 ST->getMemOperand());
24719
24720 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24721 return Split;
24722
24723 if (Subtarget->supportsAddressTopByteIgnored() &&
24724 performTBISimplification(N->getOperand(2), DCI, DAG))
24725 return SDValue(N, 0);
24726
24727 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24728 return Store;
24729
24730 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24731 return Store;
24732
24733 if (ST->isTruncatingStore() &&
24734 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24735 if (SDValue Rshrnb =
24736 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24737 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24738 MemVT, ST->getMemOperand());
24739 }
24740 }
24741
24742 // This is an integer vector_extract_elt followed by a (possibly truncating)
24743 // store. We may be able to replace this with a store of an FP subregister.
24744 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24745 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24746
24747 SDValue Vector = Value.getOperand(0);
24748 SDValue ExtIdx = Value.getOperand(1);
24749 EVT VectorVT = Vector.getValueType();
24750 EVT ElemVT = VectorVT.getVectorElementType();
24751
24752 if (!ValueVT.isInteger())
24753 return SDValue();
24754
24755 // Propagate zero constants (applying this fold may miss optimizations).
24756 if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
24757 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24758 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24759 return SDValue();
24760 }
24761
24762 if (ValueVT != MemVT && !ST->isTruncatingStore())
24763 return SDValue();
24764
24765 // This could generate an additional extract if the index is non-zero and
24766 // the extracted value has multiple uses.
24767 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24768 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24769 return SDValue();
24770
24771 // These can lower to st1, which is preferable if we're unlikely to fold the
24772 // addressing into the store.
24773 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24774 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24775 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24776 return SDValue();
24777
24778 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24779 // Heuristic: If there are other users of w/x integer scalars extracted
24780 // from this vector that won't fold into the store -- abandon folding.
24781 // Applying this fold may disrupt paired stores.
24782 for (const auto &Use : Vector->uses()) {
24783 if (Use.getResNo() != Vector.getResNo())
24784 continue;
24785 const SDNode *User = Use.getUser();
24786 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24787 (!User->hasOneUse() ||
24788 (*User->user_begin())->getOpcode() != ISD::STORE))
24789 return SDValue();
24790 }
24791 }
24792
24793 SDValue ExtVector = Vector;
24794 if (!ExtCst || !ExtCst->isZero()) {
24795 // Handle extracting from lanes != 0.
24796 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24797 Value.getValueType(), Vector, ExtIdx);
24798 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24799 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24800 DAG.getUNDEF(VectorVT), Ext, Zero);
24801 }
24802
24803 EVT FPMemVT = MemVT == MVT::i8
24804 ? MVT::aarch64mfp8
24805 : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
24806 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24807 FPMemVT, ExtVector);
24808
24809 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24810 ST->getMemOperand());
24811 }
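// For example, (store (i32 (extract_vector_elt v4i32:V, 0)), ptr) is rewritten above
// into a store of V's ssub sub-register, so the lane value is stored directly from
// the FP/SIMD register file instead of being moved to a GPR first.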
24812
24813 return SDValue();
24814}
24815
24816static bool
24817isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
24818 if (N->getOpcode() != ISD::CONCAT_VECTORS)
24819 return false;
24820
24821 unsigned NumParts = N->getNumOperands();
24822
24823 // We should be concatenating each sequential result from a
24824 // VECTOR_INTERLEAVE.
24825 SDNode *InterleaveOp = N->getOperand(0).getNode();
24826 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
24827 InterleaveOp->getNumOperands() != NumParts)
24828 return false;
24829
24830 for (unsigned I = 0; I < NumParts; I++)
24831 if (N->getOperand(I) != SDValue(InterleaveOp, I))
24832 return false;
24833
24834 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
24835 return true;
24836}
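// For example, concat_vectors(vector_interleave(A, B):0, vector_interleave(A, B):1)
// matches and fills Ops with {A, B}; any other operand order or source fails.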
24837
24838static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
24839 SDValue WideMask,
24840 unsigned RequiredNumParts) {
24841 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
24842 SmallVector<SDValue, 4> MaskInterleaveOps;
24843 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
24844 MaskInterleaveOps))
24845 return SDValue();
24846
24847 if (MaskInterleaveOps.size() != RequiredNumParts)
24848 return SDValue();
24849
24850 // Make sure the inputs to the vector interleave are identical.
24851 if (!llvm::all_equal(MaskInterleaveOps))
24852 return SDValue();
24853
24854 return MaskInterleaveOps[0];
24855 }
24856
24857 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
24858 return SDValue();
24859
24860 ElementCount EC = WideMask.getValueType().getVectorElementCount();
24861 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
24862 "Expected element count divisible by number of parts");
24863 EC = EC.divideCoefficientBy(RequiredNumParts);
24864 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
24865 WideMask->getOperand(0));
24866}
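// For example, with RequiredNumParts == 2 a splat_vector mask of type nxv32i1 is
// narrowed to an nxv16i1 splat of the same scalar, and a concat of a two-way
// VECTOR_INTERLEAVE with identical inputs yields that common input.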
24867
24868static SDValue performInterleavedMaskedStoreCombine(
24869 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24870 if (!DCI.isBeforeLegalize())
24871 return SDValue();
24872
24873 auto *MST = cast<MaskedStoreSDNode>(N);
24874 SDValue WideValue = MST->getValue();
24875
24876 // Bail out if the stored value has an unexpected number of uses, since we'll
24877 // have to perform manual interleaving and may as well just use normal masked
24878 // stores. Also, discard masked stores that are truncating or indexed.
24879 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
24880 !MST->isSimple() || !MST->getOffset().isUndef())
24881 return SDValue();
24882
24883 SmallVector<SDValue, 4> ValueInterleaveOps;
24884 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
24885 ValueInterleaveOps))
24886 return SDValue();
24887
24888 unsigned NumParts = ValueInterleaveOps.size();
24889 if (NumParts != 2 && NumParts != 4)
24890 return SDValue();
24891
24892 // At the moment we're unlikely to see a fixed-width vector interleave as
24893 // we usually generate shuffles instead.
24894 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
24895 if (!SubVecTy.isScalableVT() ||
24896 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
24897 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
24898 return SDValue();
24899
24900 SDLoc DL(N);
24901 SDValue NarrowMask =
24902 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
24903 if (!NarrowMask)
24904 return SDValue();
24905
24906 const Intrinsic::ID IID =
24907 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
24908 SmallVector<SDValue, 8> NewStOps;
24909 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
24910 NewStOps.append(ValueInterleaveOps);
24911 NewStOps.append({NarrowMask, MST->getBasePtr()});
24912 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
24913}
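// The combine above rewrites masked_store(concat(vector_interleave(A, B)), ptr, mask)
// into an aarch64_sve_st2 intrinsic (or st4 for four parts) using the narrowed mask,
// so the manual interleave disappears.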
24914
24915static SDValue performMSTORECombine(SDNode *N,
24916 TargetLowering::DAGCombinerInfo &DCI,
24917 SelectionDAG &DAG,
24918 const AArch64Subtarget *Subtarget) {
24919 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24920 SDValue Value = MST->getValue();
24921 SDValue Mask = MST->getMask();
24922 SDLoc DL(N);
24923
24924 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
24925 return Res;
24926
24927 // If this is a UZP1 followed by a masked store, fold this into a masked
24928 // truncating store. We can do this even if this is already a masked
24929 // truncstore.
24930 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24931 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24932 Value.getValueType().isInteger()) {
24933 Value = Value.getOperand(0);
24934 if (Value.getOpcode() == ISD::BITCAST) {
24935 EVT HalfVT =
24936 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24937 EVT InVT = Value.getOperand(0).getValueType();
24938
24939 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24940 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24941 unsigned PgPattern = Mask->getConstantOperandVal(0);
24942
24943 // Ensure we can double the size of the predicate pattern
24944 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24945 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24946 MinSVESize) {
24947 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24948 PgPattern);
24949 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24950 MST->getBasePtr(), MST->getOffset(), Mask,
24951 MST->getMemoryVT(), MST->getMemOperand(),
24952 MST->getAddressingMode(),
24953 /*IsTruncating=*/true);
24954 }
24955 }
24956 }
24957 }
24958
24959 if (MST->isTruncatingStore()) {
24960 EVT ValueVT = Value->getValueType(0);
24961 EVT MemVT = MST->getMemoryVT();
24962 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24963 return SDValue();
24964 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24965 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24966 MST->getOffset(), MST->getMask(),
24967 MST->getMemoryVT(), MST->getMemOperand(),
24968 MST->getAddressingMode(), true);
24969 }
24970 }
24971
24972 return SDValue();
24973}
24974
24975/// \return true if part of the index was folded into the Base.
24976static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24977 SDLoc DL, SelectionDAG &DAG) {
24978 // This function assumes a vector of i64 indices.
24979 EVT IndexVT = Index.getValueType();
24980 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24981 return false;
24982
24983 // Simplify:
24984 // BasePtr = Ptr
24985 // Index = X + splat(Offset)
24986 // ->
24987 // BasePtr = Ptr + Offset * scale.
24988 // Index = X
24989 if (Index.getOpcode() == ISD::ADD) {
24990 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24991 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24992 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24993 Index = Index.getOperand(0);
24994 return true;
24995 }
24996 }
24997
24998 // Simplify:
24999 // BasePtr = Ptr
25000 // Index = (X + splat(Offset)) << splat(Shift)
25001 // ->
25002 // BasePtr = Ptr + (Offset << Shift) * scale
25003 // Index = X << splat(shift)
25004 if (Index.getOpcode() == ISD::SHL &&
25005 Index.getOperand(0).getOpcode() == ISD::ADD) {
25006 SDValue Add = Index.getOperand(0);
25007 SDValue ShiftOp = Index.getOperand(1);
25008 SDValue OffsetOp = Add.getOperand(1);
25009 if (auto Shift = DAG.getSplatValue(ShiftOp))
25010 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
25011 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
25012 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25013 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25014 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
25015 Add.getOperand(0), ShiftOp);
25016 return true;
25017 }
25018 }
25019
25020 return false;
25021}
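// Worked example for the second pattern: BasePtr = P, Scale = 2 and
// Index = (X + splat(4)) << splat(1) become BasePtr = P + (4 << 1) * 2 = P + 16 and
// Index = X << splat(1).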
25022
25023// Analyse the specified address returning true if a more optimal addressing
25024// mode is available. When returning true all parameters are updated to reflect
25025// their recommended values.
25026 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
25027 SDValue &BasePtr, SDValue &Index,
25028 SelectionDAG &DAG) {
25029 // Try to iteratively fold parts of the index into the base pointer to
25030 // simplify the index as much as possible.
25031 bool Changed = false;
25032 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
25033 Changed = true;
25034
25035 // Only consider element types that are pointer sized as smaller types can
25036 // be easily promoted.
25037 EVT IndexVT = Index.getValueType();
25038 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
25039 return Changed;
25040
25041 // Can indices be trivially shrunk?
25042 EVT DataVT = N->getOperand(1).getValueType();
25043 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
25044 // will later be re-extended to 64 bits in legalization
25045 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
25046 return Changed;
25047 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
25048 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25049 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
25050 return true;
25051 }
25052
25053 // Match:
25054 // Index = step(const)
25055 int64_t Stride = 0;
25056 if (Index.getOpcode() == ISD::STEP_VECTOR) {
25057 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
25058 }
25059 // Match:
25060 // Index = step(const) << shift(const)
25061 else if (Index.getOpcode() == ISD::SHL &&
25062 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
25063 SDValue RHS = Index.getOperand(1);
25064 if (auto *Shift =
25065 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
25066 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
25067 Stride = Step << Shift->getZExtValue();
25068 }
25069 }
25070
25071 // Return early because no supported pattern is found.
25072 if (Stride == 0)
25073 return Changed;
25074
25075 if (Stride < std::numeric_limits<int32_t>::min() ||
25076 Stride > std::numeric_limits<int32_t>::max())
25077 return Changed;
25078
25079 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25080 unsigned MaxVScale =
25081 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
25082 int64_t LastElementOffset =
25083 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
25084
25085 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
25086 LastElementOffset > std::numeric_limits<int32_t>::max())
25087 return Changed;
25088
25089 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25090 // Stride does not scale explicitly by 'Scale', because it happens in
25091 // the gather/scatter addressing mode.
25092 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
25093 return true;
25094}
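// For example, Index = step_vector(4) over nxv2i64 can be shrunk to an i32
// step_vector as long as the furthest reachable offset,
// VectorMinNumElements * Stride * MaxVScale, still fits in a signed 32-bit value.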
25095
25096 static SDValue performMaskedGatherScatterCombine(
25097 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25098 if (!DCI.isBeforeLegalize())
25099 return SDValue();
25100 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
25101
25102 SDLoc DL(MGS);
25103 SDValue Chain = MGS->getChain();
25104 SDValue Scale = MGS->getScale();
25105 SDValue Index = MGS->getIndex();
25106 SDValue Mask = MGS->getMask();
25107 SDValue BasePtr = MGS->getBasePtr();
25108 ISD::MemIndexType IndexType = MGS->getIndexType();
25109
25110 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
25111 return SDValue();
25112
25113 // Here we catch such cases early and change MGATHER's IndexType to allow
25114 // the use of an Index that's more legalisation friendly.
25115 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
25116 SDValue PassThru = MGT->getPassThru();
25117 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
25118 return DAG.getMaskedGather(
25119 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
25120 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
25121 }
25122 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
25123 SDValue Data = MSC->getValue();
25124 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
25125 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
25126 DL, Ops, MSC->getMemOperand(), IndexType,
25127 MSC->isTruncatingStore());
25128 }
25129 auto *HG = cast<MaskedHistogramSDNode>(MGS);
25130 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
25131 Index, Scale, HG->getIntID()};
25132 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
25133 DL, Ops, HG->getMemOperand(), IndexType);
25134}
25135
25136/// Target-specific DAG combine function for NEON load/store intrinsics
25137/// to merge base address updates.
25138 static SDValue performNEONPostLDSTCombine(SDNode *N,
25139 TargetLowering::DAGCombinerInfo &DCI,
25140 SelectionDAG &DAG) {
25141 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
25142 return SDValue();
25143
25144 unsigned AddrOpIdx = N->getNumOperands() - 1;
25145 SDValue Addr = N->getOperand(AddrOpIdx);
25146
25147 // Search for a use of the address operand that is an increment.
25148 for (SDUse &Use : Addr->uses()) {
25149 SDNode *User = Use.getUser();
25150 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25151 continue;
25152
25153 // Check that the add is independent of the load/store. Otherwise, folding
25154 // it would create a cycle.
25155 SmallPtrSet<const SDNode *, 32> Visited;
25156 SmallVector<const SDNode *, 16> Worklist;
25157 Visited.insert(Addr.getNode());
25158 Worklist.push_back(N);
25159 Worklist.push_back(User);
25160 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25161 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25162 continue;
25163
25164 // Find the new opcode for the updating load/store.
25165 bool IsStore = false;
25166 bool IsLaneOp = false;
25167 bool IsDupOp = false;
25168 unsigned NewOpc = 0;
25169 unsigned NumVecs = 0;
25170 unsigned IntNo = N->getConstantOperandVal(1);
25171 switch (IntNo) {
25172 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25173 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25174 NumVecs = 2; break;
25175 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25176 NumVecs = 3; break;
25177 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25178 NumVecs = 4; break;
25179 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25180 NumVecs = 2; IsStore = true; break;
25181 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25182 NumVecs = 3; IsStore = true; break;
25183 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25184 NumVecs = 4; IsStore = true; break;
25185 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25186 NumVecs = 2; break;
25187 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25188 NumVecs = 3; break;
25189 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25190 NumVecs = 4; break;
25191 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25192 NumVecs = 2; IsStore = true; break;
25193 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25194 NumVecs = 3; IsStore = true; break;
25195 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25196 NumVecs = 4; IsStore = true; break;
25197 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25198 NumVecs = 2; IsDupOp = true; break;
25199 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25200 NumVecs = 3; IsDupOp = true; break;
25201 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25202 NumVecs = 4; IsDupOp = true; break;
25203 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25204 NumVecs = 2; IsLaneOp = true; break;
25205 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25206 NumVecs = 3; IsLaneOp = true; break;
25207 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25208 NumVecs = 4; IsLaneOp = true; break;
25209 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25210 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25211 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25212 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25213 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25214 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25215 }
25216
25217 EVT VecTy;
25218 if (IsStore)
25219 VecTy = N->getOperand(2).getValueType();
25220 else
25221 VecTy = N->getValueType(0);
25222
25223 // If the increment is a constant, it must match the memory ref size.
25224 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25225 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25226 uint32_t IncVal = CInc->getZExtValue();
25227 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25228 if (IsLaneOp || IsDupOp)
25229 NumBytes /= VecTy.getVectorNumElements();
25230 if (IncVal != NumBytes)
25231 continue;
25232 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25233 }
25234 SmallVector<SDValue, 8> Ops;
25235 Ops.push_back(N->getOperand(0)); // Incoming chain
25236 // Load-lane and store operations take a vector list as input.
25237 if (IsLaneOp || IsStore)
25238 for (unsigned i = 2; i < AddrOpIdx; ++i)
25239 Ops.push_back(N->getOperand(i));
25240 Ops.push_back(Addr); // Base register
25241 Ops.push_back(Inc);
25242
25243 // Return Types.
25244 EVT Tys[6];
25245 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25246 unsigned n;
25247 for (n = 0; n < NumResultVecs; ++n)
25248 Tys[n] = VecTy;
25249 Tys[n++] = MVT::i64; // Type of write back register
25250 Tys[n] = MVT::Other; // Type of the chain
25251 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25252
25253 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25254 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25255 MemInt->getMemoryVT(),
25256 MemInt->getMemOperand());
25257
25258 // Update the uses.
25259 std::vector<SDValue> NewResults;
25260 for (unsigned i = 0; i < NumResultVecs; ++i) {
25261 NewResults.push_back(SDValue(UpdN.getNode(), i));
25262 }
25263 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25264 DCI.CombineTo(N, NewResults);
25265 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25266
25267 break;
25268 }
25269 return SDValue();
25270}
25271
25272// Checks to see if the value is the prescribed width and returns information
25273// about its extension mode.
25274static
25275bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25276 ExtType = ISD::NON_EXTLOAD;
25277 switch(V.getNode()->getOpcode()) {
25278 default:
25279 return false;
25280 case ISD::LOAD: {
25281 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25282 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25283 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25284 ExtType = LoadNode->getExtensionType();
25285 return true;
25286 }
25287 return false;
25288 }
25289 case ISD::AssertSext: {
25290 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25291 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25292 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25293 ExtType = ISD::SEXTLOAD;
25294 return true;
25295 }
25296 return false;
25297 }
25298 case ISD::AssertZext: {
25299 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25300 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25301 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25302 ExtType = ISD::ZEXTLOAD;
25303 return true;
25304 }
25305 return false;
25306 }
25307 case ISD::Constant:
25308 case ISD::TargetConstant: {
25309 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25310 1LL << (width - 1);
25311 }
25312 }
25313
25314 return true;
25315}
25316
25317// This function does a whole lot of voodoo to determine if the tests are
25318// equivalent without and with a mask. Essentially what happens is that given a
25319// DAG resembling:
25320//
25321// +-------------+ +-------------+ +-------------+ +-------------+
25322// | Input | | AddConstant | | CompConstant| | CC |
25323// +-------------+ +-------------+ +-------------+ +-------------+
25324// | | | |
25325// V V | +----------+
25326// +-------------+ +----+ | |
25327// | ADD | |0xff| | |
25328// +-------------+ +----+ | |
25329// | | | |
25330// V V | |
25331// +-------------+ | |
25332// | AND | | |
25333// +-------------+ | |
25334// | | |
25335// +-----+ | |
25336// | | |
25337// V V V
25338// +-------------+
25339// | CMP |
25340// +-------------+
25341//
25342// The AND node may be safely removed for some combinations of inputs. In
25343// particular we need to take into account the extension type of the Input,
25344// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25345 // width of the input (this can work for any width inputs, the above graph is
25346 // specific to 8 bits).
25347//
25348// The specific equations were worked out by generating output tables for each
25349 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
25350// problem was simplified by working with 4 bit inputs, which means we only
25351// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25352// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25353// patterns present in both extensions (0,7). For every distinct set of
25354 // AddConstant and CompConstant bit patterns we can consider the masked and
25355 // unmasked versions to be equivalent if the result of this function is true for
25356 // all 16 distinct bit patterns for the current extension type of Input (w0).
25357//
25358// sub w8, w0, w1
25359// and w10, w8, #0x0f
25360// cmp w8, w2
25361// cset w9, AArch64CC
25362// cmp w10, w2
25363// cset w11, AArch64CC
25364// cmp w9, w11
25365// cset w0, eq
25366// ret
25367//
25368// Since the above function shows when the outputs are equivalent it defines
25369// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25370// would be expensive to run during compiles. The equations below were written
25371 // in a test harness that confirmed they gave outputs equivalent to the above
25372 // function for all inputs, so they can be used to determine if the removal is
25373 // legal instead.
25374 //
25375 // isEquivalentMaskless() is the code for testing if the AND can be removed,
25376 // factored out of the DAG recognition since the DAG can take several forms.
25377
25378static bool isEquivalentMaskless(unsigned CC, unsigned width,
25379 ISD::LoadExtType ExtType, int AddConstant,
25380 int CompConstant) {
25381 // By being careful about our equations and only writing them in terms of
25382 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
25383 // make them generally applicable to all bit widths.
25384 int MaxUInt = (1 << width);
25385
25386 // For the purposes of these comparisons sign extending the type is
25387 // equivalent to zero extending the add and displacing it by half the integer
25388 // width. Provided we are careful and make sure our equations are valid over
25389 // the whole range we can just adjust the input and avoid writing equations
25390 // for sign extended inputs.
25391 if (ExtType == ISD::SEXTLOAD)
25392 AddConstant -= (1 << (width-1));
25393
25394 switch(CC) {
25395 case AArch64CC::LE:
25396 case AArch64CC::GT:
25397 if ((AddConstant == 0) ||
25398 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25399 (AddConstant >= 0 && CompConstant < 0) ||
25400 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25401 return true;
25402 break;
25403 case AArch64CC::LT:
25404 case AArch64CC::GE:
25405 if ((AddConstant == 0) ||
25406 (AddConstant >= 0 && CompConstant <= 0) ||
25407 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25408 return true;
25409 break;
25410 case AArch64CC::HI:
25411 case AArch64CC::LS:
25412 if ((AddConstant >= 0 && CompConstant < 0) ||
25413 (AddConstant <= 0 && CompConstant >= -1 &&
25414 CompConstant < AddConstant + MaxUInt))
25415 return true;
25416 break;
25417 case AArch64CC::PL:
25418 case AArch64CC::MI:
25419 if ((AddConstant == 0) ||
25420 (AddConstant > 0 && CompConstant <= 0) ||
25421 (AddConstant < 0 && CompConstant <= AddConstant))
25422 return true;
25423 break;
25424 case AArch64CC::LO:
25425 case AArch64CC::HS:
25426 if ((AddConstant >= 0 && CompConstant <= 0) ||
25427 (AddConstant <= 0 && CompConstant >= 0 &&
25428 CompConstant <= AddConstant + MaxUInt))
25429 return true;
25430 break;
25431 case AArch64CC::EQ:
25432 case AArch64CC::NE:
25433 if ((AddConstant > 0 && CompConstant < 0) ||
25434 (AddConstant < 0 && CompConstant >= 0 &&
25435 CompConstant < AddConstant + MaxUInt) ||
25436 (AddConstant >= 0 && CompConstant >= 0 &&
25437 CompConstant >= AddConstant) ||
25438 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
25439 return true;
25440 break;
25441 case AArch64CC::VS:
25442 case AArch64CC::VC:
25443 case AArch64CC::AL:
25444 case AArch64CC::NV:
25445 return true;
25446 case AArch64CC::Invalid:
25447 break;
25448 }
25449
25450 return false;
25451}
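// For example, with ExtType == ZEXTLOAD, width == 8, AddConstant == 0 and
// CompConstant == 0, the LO/HS case hits the (AddConstant >= 0 && CompConstant <= 0)
// clause, so the masked and unmasked comparisons are equivalent and the AND can go.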
25452
25453 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
25454 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
25455 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25456 SDNode *AndNode, SelectionDAG &DAG,
25457 unsigned CCIndex, unsigned CmpIndex,
25458 unsigned CC) {
25459 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25460 if (!SubsC)
25461 return SDValue();
25462
25463 APInt SubsAP = SubsC->getAPIntValue();
25464 if (CC == AArch64CC::HI) {
25465 if (!SubsAP.isMask())
25466 return SDValue();
25467 } else if (CC == AArch64CC::LO) {
25468 if (!SubsAP.isPowerOf2())
25469 return SDValue();
25470 } else
25471 return SDValue();
25472
25473 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25474 if (!AndC)
25475 return SDValue();
25476
25477 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25478
25479 SDLoc DL(N);
25480 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25481 SDValue ANDS = DAG.getNode(
25482 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25483 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25484 SDValue AArch64_CC =
25485 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
25486 N->getOperand(CCIndex)->getValueType(0));
25487
25488 // For now, only performCSELCombine and performBRCONDCombine call this
25489 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
25490 // with 4 operands, so just initialize the operands directly to keep the code
25491 // simple. If another caller with different CCIndex/CmpIndex values appears,
25492 // this will need to be rewritten with a loop.
25493 // TODO: Do we need to assert that the number of operands is 4 here?
25494 assert((CCIndex == 2 && CmpIndex == 3) &&
25495 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25496 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25497 ANDS.getValue(1)};
25498 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25499}
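// For example, (X & 0b0111) >u 0b0011: SubsAP (0b0011) is a mask, so AndSMask becomes
// 0b0111 & ~0b0011 = 0b0100 and the compare is replaced by ANDS(X, 0b0100) tested for
// the "!= 0" form described in the comment above.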
25500
25501static
25502 SDValue performCONDCombine(SDNode *N,
25503 TargetLowering::DAGCombinerInfo &DCI,
25504 SelectionDAG &DAG, unsigned CCIndex,
25505 unsigned CmpIndex) {
25506 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25507 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25508 unsigned CondOpcode = SubsNode->getOpcode();
25509
25510 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25511 !SubsNode->hasOneUse())
25512 return SDValue();
25513
25514 // There is a SUBS feeding this condition. Is it fed by a mask we can
25515 // use?
25516
25517 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25518 unsigned MaskBits = 0;
25519
25520 if (AndNode->getOpcode() != ISD::AND)
25521 return SDValue();
25522
25523 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25524 CmpIndex, CC))
25525 return Val;
25526
25527 // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
25528 // sequence of ones starting at the least significant bit with the remainder
25529 // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
25530 // into a SUBS (immediate). The transformed form can be matched into a SUBS
25531 // (shifted register).
25532 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
25533 isa<ConstantSDNode>(AndNode->getOperand(1)) &&
25534 isa<ConstantSDNode>(SubsNode->getOperand(1))) {
25535 SDValue X = AndNode->getOperand(0);
25536 APInt M = AndNode->getConstantOperandAPInt(1);
25537 APInt C = SubsNode->getConstantOperandAPInt(1);
25538
25539 if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
25540 SDLoc DL(SubsNode);
25541 EVT VT = SubsNode->getValueType(0);
25542 unsigned ShiftAmt = M.countl_zero();
25543 SDValue ShiftedX = DAG.getNode(
25544 ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
25545 SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
25546 SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
25547 ShiftedC, ShiftedX);
25548 DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
25549 return SDValue(N, 0);
25550 }
25551 }
25552
25553 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25554 uint32_t CNV = CN->getZExtValue();
25555 if (CNV == 255)
25556 MaskBits = 8;
25557 else if (CNV == 65535)
25558 MaskBits = 16;
25559 }
25560
25561 if (!MaskBits)
25562 return SDValue();
25563
25564 SDValue AddValue = AndNode->getOperand(0);
25565
25566 if (AddValue.getOpcode() != ISD::ADD)
25567 return SDValue();
25568
25569 // The basic dag structure is correct, grab the inputs and validate them.
25570
25571 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25572 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25573 SDValue SubsInputValue = SubsNode->getOperand(1);
25574
25575 // The mask is present and the provenance of all the values is a smaller type;
25576 // let's see if the mask is superfluous.
25577
25578 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25579 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25580 return SDValue();
25581
25582 ISD::LoadExtType ExtType;
25583
25584 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25585 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25586 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25587 return SDValue();
25588
25589 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25590 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25591 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25592 return SDValue();
25593
25594 // The AND is not necessary, remove it.
25595
25596 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25597 SubsNode->getValueType(1));
25598 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25599
25600 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25601 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25602
25603 return SDValue(N, 0);
25604}
25605
25606// Optimize compare with zero and branch.
25607 static SDValue performBRCONDCombine(SDNode *N,
25608 TargetLowering::DAGCombinerInfo &DCI,
25609 SelectionDAG &DAG) {
25610 MachineFunction &MF = DAG.getMachineFunction();
25611 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25612 // will not be produced, as they are conditional branch instructions that do
25613 // not set flags.
25614 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25615 return SDValue();
25616
25617 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25618 N = NV.getNode();
25619 SDValue Chain = N->getOperand(0);
25620 SDValue Dest = N->getOperand(1);
25621 SDValue CCVal = N->getOperand(2);
25622 SDValue Cmp = N->getOperand(3);
25623
25624 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25625 unsigned CC = CCVal->getAsZExtVal();
25626 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25627 return SDValue();
25628
25629 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25630 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25631 SDValue CSel = Cmp.getOperand(0);
25632 auto CSelCC = getCSETCondCode(CSel);
25633 if (CSelCC) {
25634 SDLoc DL(N);
25635 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25636 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25637 CSel.getOperand(3));
25638 }
25639 }
25640
25641 unsigned CmpOpc = Cmp.getOpcode();
25642 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25643 return SDValue();
25644
25645 // Only attempt folding if there is only one use of the flag and no use of the
25646 // value.
25647 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25648 return SDValue();
25649
25650 SDValue LHS = Cmp.getOperand(0);
25651 SDValue RHS = Cmp.getOperand(1);
25652
25653 assert(LHS.getValueType() == RHS.getValueType() &&
25654 "Expected the value type to be the same for both operands!");
25655 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25656 return SDValue();
25657
25658 if (isNullConstant(LHS))
25659 std::swap(LHS, RHS);
25660
25661 if (!isNullConstant(RHS))
25662 return SDValue();
25663
25664 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25665 LHS.getOpcode() == ISD::SRL)
25666 return SDValue();
25667
25668 // Fold the compare into the branch instruction.
25669 SDValue BR;
25670 if (CC == AArch64CC::EQ)
25671 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25672 else
25673 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25674
25675 // Do not add new nodes to DAG combiner worklist.
25676 DCI.CombineTo(N, BR, false);
25677
25678 return SDValue();
25679}
25680
25681 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25682 unsigned CC = N->getConstantOperandVal(2);
25683 SDValue SUBS = N->getOperand(3);
25684 SDValue Zero, CTTZ;
25685
25686 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25687 Zero = N->getOperand(0);
25688 CTTZ = N->getOperand(1);
25689 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25690 Zero = N->getOperand(1);
25691 CTTZ = N->getOperand(0);
25692 } else
25693 return SDValue();
25694
25695 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25696 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25697 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25698 return SDValue();
25699
25700 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25701 "Illegal type in CTTZ folding");
25702
25703 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25704 return SDValue();
25705
25706 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25707 ? CTTZ.getOperand(0).getOperand(0)
25708 : CTTZ.getOperand(0);
25709
25710 if (X != SUBS.getOperand(0))
25711 return SDValue();
25712
25713 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25714 ? CTTZ.getOperand(0).getValueSizeInBits()
25715 : CTTZ.getValueSizeInBits();
25716 SDValue BitWidthMinusOne =
25717 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25718 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25719 BitWidthMinusOne);
25720}
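// For example, (csel 0, (cttz X), eq, (subs X, 0)) on i64 becomes (and (cttz X), 63):
// cttz returns the bit width for a zero input, and masking with bitwidth-1 folds that
// case back to 0, matching the value the CSEL would have picked.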
25721
25722// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25723// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25724// Where x and y are constants and x != y
25725
25726// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25727// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25728// Where x and y are constants and x != y
25729 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25730 SDValue L = Op->getOperand(0);
25731 SDValue R = Op->getOperand(1);
25732 AArch64CC::CondCode OpCC =
25733 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25734
25735 SDValue OpCmp = Op->getOperand(3);
25736 if (!isCMP(OpCmp))
25737 return SDValue();
25738
25739 SDValue CmpLHS = OpCmp.getOperand(0);
25740 SDValue CmpRHS = OpCmp.getOperand(1);
25741
25742 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25743 std::swap(CmpLHS, CmpRHS);
25744 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25745 return SDValue();
25746
25747 SDValue X = CmpLHS->getOperand(0);
25748 SDValue Y = CmpLHS->getOperand(1);
25749 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25750 return SDValue();
25751 }
25752
25753 // If one of the constants is an opaque constant, the X/Y SDNodes may still be
25754 // different even though the underlying values are the same. Check the APInt
25755 // values here to make sure the code is correct.
25756 ConstantSDNode *CX = cast<ConstantSDNode>(X);
25757 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
25758 if (CX->getAPIntValue() == CY->getAPIntValue())
25759 return SDValue();
25760
25761 AArch64CC::CondCode CC =
25762 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25763 SDValue Cond = CmpLHS->getOperand(3);
25764
25765 if (CmpRHS == Y)
25766 CC = getInvertedCondCode(CC);
25767 else if (CmpRHS != X)
25768 return SDValue();
25769
25770 if (OpCC == AArch64CC::NE)
25771 CC = getInvertedCondCode(CC);
25772 else if (OpCC != AArch64CC::EQ)
25773 return SDValue();
25774
25775 SDLoc DL(Op);
25776 EVT VT = Op->getValueType(0);
25777
25778 SDValue CCValue = getCondCode(DAG, CC);
25779 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25780}
25781
25782// Reassociate the true/false expressions of a CSEL instruction to obtain a
25783// common subexpression with the comparison instruction. For example, change
25784// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25785// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25786// subexpression.
25787 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25788 SDValue SubsNode = N->getOperand(3);
25789 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25790 return SDValue();
25791
25792 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25793 SDValue CmpOpOther = SubsNode.getOperand(0);
25794 EVT VT = N->getValueType(0);
25795
25796 unsigned ExpectedOpcode;
25797 SDValue ExpectedOp;
25798 SDValue SubsOp;
25799 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25800 if (CmpOpConst) {
25801 ExpectedOpcode = ISD::ADD;
25802 ExpectedOp =
25803 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25804 CmpOpConst->getValueType(0));
25805 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25806 CmpOpConst->getValueType(0));
25807 } else {
25808 ExpectedOpcode = ISD::SUB;
25809 ExpectedOp = CmpOpToMatch;
25810 SubsOp = CmpOpToMatch;
25811 }
25812
25813 // Get the operand that can be reassociated with the SUBS instruction.
25814 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25815 if (Op.getOpcode() != ExpectedOpcode)
25816 return SDValue();
25817 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25818 !Op.getOperand(0).hasOneUse())
25819 return SDValue();
25820 SDValue X = Op.getOperand(0).getOperand(0);
25821 SDValue Y = Op.getOperand(0).getOperand(1);
25822 if (X != CmpOpOther)
25823 std::swap(X, Y);
25824 if (X != CmpOpOther)
25825 return SDValue();
25826 if (ExpectedOp != Op.getOperand(1))
25827 return SDValue();
25828 return Y;
25829 };
25830
25831 // Try the reassociation using the given constant and condition code.
25832 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25833 SDValue SubsOp) {
25834 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25835 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25836 if (!TReassocOp && !FReassocOp)
25837 return SDValue();
25838
25839 SDValue NewCmp =
25840 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25841 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25842
25843 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25844 if (!ReassocOp)
25845 return N->getOperand(OpNum);
25846 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25847 NewCmp.getValue(0), ReassocOp);
25848 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25849 return Res;
25850 };
25851
25852 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25853 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25854 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25855 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25856 };
25857
25858 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25859
25860 // First, try to eliminate the compare instruction by searching for a
25861 // subtraction with the same constant.
25862 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25863 return R;
25864
25865 if (!CmpOpConst) {
25866 // Try again with the operands of the SUBS instruction and the condition
25867 // swapped. Due to canonicalization, this only helps for non-constant
25868 // operands of the SUBS instruction.
25869 std::swap(CmpOpToMatch, CmpOpOther);
25870 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25871 return R;
25872 return SDValue();
25873 }
25874
25875 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25876 return SDValue();
25877
25878 // Next, search for a subtraction with a slightly different constant. By
25879 // adjusting the condition code, we can still eliminate the compare
25880 // instruction. Adjusting the constant is only valid if it does not result
25881 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25882 // Since such comparisons are trivially true/false, we should not encounter
25883 // them here but check for them nevertheless to be on the safe side.
25884 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25885 AArch64CC::CondCode NewCC) {
25886 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25887 CmpOpConst->getValueType(0));
25888 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25889 CmpOpConst->getValueType(0));
25890 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25891 };
25892 switch (CC) {
25893 case AArch64CC::EQ:
25894 case AArch64CC::LS:
25895 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25896 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25897 case AArch64CC::NE:
25898 case AArch64CC::HI:
25899 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25900 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25901 case AArch64CC::LO:
25902 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25903 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25904 case AArch64CC::HS:
25905 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25906 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25907 case AArch64CC::LT:
25908 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25909 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25910 case AArch64CC::LE:
25911 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25912 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25913 case AArch64CC::GT:
25914 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25915 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25916 case AArch64CC::GE:
25917 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25918 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25919 default:
25920 return SDValue();
25921 }
25922}
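// For example, a CSEL guarded by LS on (SUBS x, c) whose true arm is
// (ADD (ADD x y) -(c+1)) is rewritten to use LO on (SUBS x, c+1), since x <=u c is
// equivalent to x <u c+1; the arm then becomes (ADD (SUBS x, c+1) y) and shares the
// compare.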
25923
25924 static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25925 AArch64CC::CondCode OpCC =
25926 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25927
25928 if (OpCC != AArch64CC::NE)
25929 return SDValue();
25930
25931 SDValue PTest = Op->getOperand(3);
25932 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25933 return SDValue();
25934
25935 SDValue TruePred = PTest.getOperand(0);
25936 SDValue AnyPred = PTest.getOperand(1);
25937
25938 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25939 TruePred = TruePred.getOperand(0);
25940
25941 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25942 AnyPred = AnyPred.getOperand(0);
25943
25944 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25945 return SDValue();
25946
25947 SDValue LastB = Op->getOperand(0);
25948 SDValue Default = Op->getOperand(1);
25949
25950 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25951 return SDValue();
25952
25953 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25954 AnyPred, Default, LastB.getOperand(1));
25955}
25956
25957// Optimize CSEL instructions
25958 static SDValue performCSELCombine(SDNode *N,
25959 TargetLowering::DAGCombinerInfo &DCI,
25960 SelectionDAG &DAG) {
25961 // CSEL x, x, cc -> x
25962 if (N->getOperand(0) == N->getOperand(1))
25963 return N->getOperand(0);
25964
25965 if (SDValue R = foldCSELOfCSEL(N, DAG))
25966 return R;
25967
25968 // Try to reassociate the true/false expressions so that we can do CSE with
25969 // a SUBS instruction used to perform the comparison.
25970 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25971 return R;
25972
25973 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25974 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
25975 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25976 return Folded;
25977
25978 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25979 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25980 SDValue Cond = N->getOperand(3);
25981 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25982 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25983 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25984 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25985 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25986 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25987 !isNullConstant(Cond.getOperand(1))) {
25988 AArch64CC::CondCode OldCond =
25989 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25990 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25991 if (NewCond != AArch64CC::AL) {
25992 SDLoc DL(N);
25993 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25994 Cond.getOperand(1), Cond.getOperand(0));
25995 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25996 N->getOperand(1), getCondCode(DAG, NewCond),
25997 Sub.getValue(1));
25998 }
25999 }
26000
26001 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
26002 // use overflow flags, to avoid the comparison with zero. In case of success,
26003 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
26004 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
26005 // nodes with their SUBS equivalent as is already done for other flag-setting
26006 // operators, in which case doing the replacement here becomes redundant.
26007 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
26008 isNullConstant(Cond.getOperand(1))) {
26009 SDValue Sub = Cond.getOperand(0);
26010 AArch64CC::CondCode CC =
26011 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26012 if (Sub.getOpcode() == ISD::SUB &&
26013 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
26014 CC == AArch64CC::PL)) {
26015 SDLoc DL(N);
26016 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
26017 Sub.getOperand(0), Sub.getOperand(1));
26018 DCI.CombineTo(Sub.getNode(), Subs);
26019 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
26020 return SDValue(N, 0);
26021 }
26022 }
26023
26024 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
26025 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
26026 return CondLast;
26027
26028 return performCONDCombine(N, DCI, DAG, 2, 3);
26029}
26030
26031 // Try to re-use an already extended operand of a vector SetCC feeding an
26032 // extended select. Doing so avoids requiring another full extension of the
26033 // SET_CC result when lowering the select.
26034 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
26035 EVT Op0MVT = Op->getOperand(0).getValueType();
26036 if (!Op0MVT.isVector() || Op->use_empty())
26037 return SDValue();
26038
26039 // Make sure that all uses of Op are VSELECTs with result matching types where
26040 // the result type has a larger element type than the SetCC operand.
26041 SDNode *FirstUse = *Op->user_begin();
26042 if (FirstUse->getOpcode() != ISD::VSELECT)
26043 return SDValue();
26044 EVT UseMVT = FirstUse->getValueType(0);
26045 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
26046 return SDValue();
26047 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
26048 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
26049 }))
26050 return SDValue();
26051
26052 APInt V;
26053 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
26054 return SDValue();
26055
26056 SDLoc DL(Op);
26057 SDValue Op0ExtV;
26058 SDValue Op1ExtV;
26059 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
26060 // Check if the first operand of the SET_CC is already extended. If it is,
26061 // split the SET_CC and re-use the extended version of the operand.
26062 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
26063 Op->getOperand(0));
26064 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
26065 Op->getOperand(0));
26066 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26067 Op0ExtV = SDValue(Op0SExt, 0);
26068 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
26069 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26070 Op0ExtV = SDValue(Op0ZExt, 0);
26071 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
26072 } else
26073 return SDValue();
26074
26075 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
26076 Op0ExtV, Op1ExtV, Op->getOperand(2));
26077}
26078
26079static SDValue
26080 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26081 SelectionDAG &DAG) {
26082 SDValue Vec = N->getOperand(0);
26083 if (DCI.isBeforeLegalize() &&
26084 Vec.getValueType().getVectorElementType() == MVT::i1 &&
26085 Vec.getValueType().isFixedLengthVector() &&
26086 Vec.getValueType().isPow2VectorType()) {
26087 SDLoc DL(N);
26088 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
26089 DAG);
26090 }
26091
26092 return SDValue();
26093}
26094
26095 static SDValue performSETCCCombine(SDNode *N,
26096 TargetLowering::DAGCombinerInfo &DCI,
26097 SelectionDAG &DAG) {
26098 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
26099 SDValue LHS = N->getOperand(0);
26100 SDValue RHS = N->getOperand(1);
26101 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
26102 SDLoc DL(N);
26103 EVT VT = N->getValueType(0);
26104
26105 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
26106 return V;
26107
26108 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
26109 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
26110 LHS->getOpcode() == AArch64ISD::CSEL &&
26111 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
26112 LHS->hasOneUse()) {
26113 // Invert CSEL's condition.
26114 auto OldCond =
26115 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
26116 auto NewCond = getInvertedCondCode(OldCond);
26117
26118 // csel 0, 1, !cond, X
26119 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
26120 LHS.getOperand(0), LHS.getOperand(1),
26121 getCondCode(DAG, NewCond), LHS.getOperand(3));
26122 return DAG.getZExtOrTrunc(CSEL, DL, VT);
26123 }
26124
26125 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
26126 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
26127 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
26128 LHS->hasOneUse()) {
26129 EVT TstVT = LHS->getValueType(0);
26130 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
26131 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
26132 // This pattern will be optimized better in emitComparison.
26133 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
26134 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
26135 DAG.getSignedConstant(TstImm, DL, TstVT));
26136 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
26137 }
26138 }
26139
26140 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
26141 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
26142 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
26143 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
26144 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
26145 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
26146 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
26147 LHS->getOpcode() == ISD::BITCAST) {
26148 EVT ToVT = LHS->getValueType(0);
26149 EVT FromVT = LHS->getOperand(0).getValueType();
26150 if (FromVT.isFixedLengthVector() &&
26151 FromVT.getVectorElementType() == MVT::i1) {
26152 bool IsNull = isNullConstant(RHS);
26153 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
26154 DL, MVT::i1, LHS->getOperand(0));
26155 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
26156 LHS);
26157 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
26158 }
26159 }
26160
26161 // Try to perform the memcmp when the result is tested for [in]equality with 0
26162 if (SDValue V = performOrXorChainCombine(N, DAG))
26163 return V;
26164
26165 EVT CmpVT = LHS.getValueType();
26166
26167 // NOTE: This exists as a combine only because it proved too awkward to match
26168 // splat(1) across all the NEON types during isel.
26169 APInt SplatLHSVal;
26170 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
26171 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
26172 SplatLHSVal.isOne())
26173 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
26174
26175 return SDValue();
26176}
26177
26178// Replace a flag-setting operator (eg ANDS) with the generic version
26179// (eg AND) if the flag is unused.
26180 static SDValue performFlagSettingCombine(SDNode *N,
26181 TargetLowering::DAGCombinerInfo &DCI,
26182 unsigned GenericOpcode) {
26183 SDLoc DL(N);
26184 SDValue LHS = N->getOperand(0);
26185 SDValue RHS = N->getOperand(1);
26186 EVT VT = N->getValueType(0);
26187
26188 // If the flag result isn't used, convert back to a generic opcode.
26189 if (!N->hasAnyUseOfValue(1)) {
26190 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26191 return DCI.CombineTo(N, Res, SDValue(N, 1));
26192 }
26193
26194 // Combine identical generic nodes into this node, re-using the result.
26195 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26196 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
26197 DCI.CombineTo(Generic, SDValue(N, 0));
26198
26199 return SDValue();
26200}
26201
26202 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26203 // setcc_merge_zero pred
26204 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26205 // => extract_subvector (inner setcc_merge_zero)
26206 SDValue Pred = N->getOperand(0);
26207 SDValue LHS = N->getOperand(1);
26208 SDValue RHS = N->getOperand(2);
26209 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26210
26211 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26212 LHS->getOpcode() != ISD::SIGN_EXTEND)
26213 return SDValue();
26214
26215 SDValue Extract = LHS->getOperand(0);
26216 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26217 Extract->getValueType(0) != N->getValueType(0) ||
26218 Extract->getConstantOperandVal(1) != 0)
26219 return SDValue();
26220
26221 SDValue InnerSetCC = Extract->getOperand(0);
26222 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26223 return SDValue();
26224
26225 // By this point we've effectively got
26226 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26227 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26228 // can operate on A directly.
26229 SDValue InnerPred = InnerSetCC.getOperand(0);
26230 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26231 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26232 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26233 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26234 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26235 return Extract;
26236
26237 return SDValue();
26238}
26239
26240static bool isSignExtInReg(const SDValue &V) {
26241 if (V.getOpcode() != AArch64ISD::VASHR ||
26242 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26243 return false;
26244
26245 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26246 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26247 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26248 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26249}
26250
26251static SDValue
26252 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26253 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26254 "Unexpected opcode!");
26255
26256 SelectionDAG &DAG = DCI.DAG;
26257 SDValue Pred = N->getOperand(0);
26258 SDValue LHS = N->getOperand(1);
26259 SDValue RHS = N->getOperand(2);
26260 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26261
26262 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26263 return V;
26264
26265 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26266 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26267 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26268 // setcc_merge_zero(
26269 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26270 // => setcc_merge_zero(pred, ...)
26271 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26272 LHS->getOperand(0)->getOperand(0) == Pred)
26273 return LHS->getOperand(0);
26274
26275 // setcc_merge_zero(
26276 // all_active, extend(nxvNi1 ...), != splat(0))
26277 // -> nxvNi1 ...
26278 if (isAllActivePredicate(DAG, Pred))
26279 return LHS->getOperand(0);
26280
26281 // setcc_merge_zero(
26282 // pred, extend(nxvNi1 ...), != splat(0))
26283 // -> nxvNi1 and(pred, ...)
26284 if (DCI.isAfterLegalizeDAG())
26285 // Do this after legalization to allow more folds on setcc_merge_zero
26286 // to be recognized.
26287 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26288 LHS->getOperand(0), Pred);
26289 }
26290
26291 // setcc_merge_zero(
26292 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26293 // => setcc_merge_zero(
26294 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26295 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26296 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26297 SDValue L0 = LHS->getOperand(0);
26298 SDValue L1 = LHS->getOperand(1);
26299 SDValue L2 = LHS->getOperand(2);
26300
26301 if (L0.getOpcode() == ISD::UNDEF && isNullConstant(L2) &&
26302 isSignExtInReg(L1)) {
26303 SDLoc DL(N);
26304 SDValue Shl = L1.getOperand(0);
26305 SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26306 LHS.getValueType(), L0, Shl, L2);
26307 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26308 Pred, NewLHS, RHS, N->getOperand(3));
26309 }
26310 }
26311
26312 return SDValue();
26313}
26314
26315// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26316// as well as whether the test should be inverted. This code is required to
26317// catch these cases (as opposed to standard dag combines) because
26318// AArch64ISD::TBZ is matched during legalization.
26319static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26320 SelectionDAG &DAG) {
26321
26322 if (!Op->hasOneUse())
26323 return Op;
26324
26325 // We don't handle undef/constant-fold cases below, as they should have
26326 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26327 // etc.)
26328
26329 // (tbz (trunc x), b) -> (tbz x, b)
26330 // This case is just here to enable more of the below cases to be caught.
26331 if (Op->getOpcode() == ISD::TRUNCATE &&
26332 Bit < Op->getValueType(0).getSizeInBits()) {
26333 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26334 }
26335
26336 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26337 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26338 Bit < Op->getOperand(0).getValueSizeInBits()) {
26339 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26340 }
26341
26342 if (Op->getNumOperands() != 2)
26343 return Op;
26344
26345 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26346 if (!C)
26347 return Op;
26348
26349 switch (Op->getOpcode()) {
26350 default:
26351 return Op;
26352
26353 // (tbz (and x, m), b) -> (tbz x, b)
26354 case ISD::AND:
26355 if ((C->getZExtValue() >> Bit) & 1)
26356 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26357 return Op;
26358
26359 // (tbz (shl x, c), b) -> (tbz x, b-c)
26360 case ISD::SHL:
26361 if (C->getZExtValue() <= Bit &&
26362 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26363 Bit = Bit - C->getZExtValue();
26364 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26365 }
26366 return Op;
26367
26368 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26369 case ISD::SRA:
26370 Bit = Bit + C->getZExtValue();
26371 if (Bit >= Op->getValueType(0).getSizeInBits())
26372 Bit = Op->getValueType(0).getSizeInBits() - 1;
26373 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26374
26375 // (tbz (srl x, c), b) -> (tbz x, b+c)
26376 case ISD::SRL:
26377 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26378 Bit = Bit + C->getZExtValue();
26379 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26380 }
26381 return Op;
26382
26383 // (tbz (xor x, -1), b) -> (tbnz x, b)
26384 case ISD::XOR:
26385 if ((C->getZExtValue() >> Bit) & 1)
26386 Invert = !Invert;
26387 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26388 }
26389}
26390
26391// Optimize test single bit zero/non-zero and branch.
26392 static SDValue performTBZCombine(SDNode *N,
26393 TargetLowering::DAGCombinerInfo &DCI,
26394 SelectionDAG &DAG) {
26395 unsigned Bit = N->getConstantOperandVal(2);
26396 bool Invert = false;
26397 SDValue TestSrc = N->getOperand(1);
26398 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26399
26400 if (TestSrc == NewTestSrc)
26401 return SDValue();
26402
26403 unsigned NewOpc = N->getOpcode();
26404 if (Invert) {
26405 if (NewOpc == AArch64ISD::TBZ)
26406 NewOpc = AArch64ISD::TBNZ;
26407 else {
26408 assert(NewOpc == AArch64ISD::TBNZ);
26409 NewOpc = AArch64ISD::TBZ;
26410 }
26411 }
26412
26413 SDLoc DL(N);
26414 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26415 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26416}
26417
26418// Swap vselect operands where it may allow a predicated operation to achieve
26419// the `sel`.
26420//
26421// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26422// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
26423 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26424 auto SelectA = N->getOperand(1);
26425 auto SelectB = N->getOperand(2);
26426 auto NTy = N->getValueType(0);
26427
26428 if (!NTy.isScalableVector())
26429 return SDValue();
26430 SDValue SetCC = N->getOperand(0);
26431 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26432 return SDValue();
26433
26434 switch (SelectB.getOpcode()) {
26435 default:
26436 return SDValue();
26437 case ISD::FMUL:
26438 case ISD::FSUB:
26439 case ISD::FADD:
26440 break;
26441 }
26442 if (SelectA != SelectB.getOperand(0))
26443 return SDValue();
26444
26445 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26446 ISD::CondCode InverseCC =
26447 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26448 auto InverseSetCC =
26449 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26450 SetCC.getOperand(1), InverseCC);
26451
26452 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26453 {InverseSetCC, SelectB, SelectA});
26454}
26455
26456// vselect (v1i1 setcc) ->
26457// vselect (v1iXX setcc) (XX is the size of the compared operand type)
26458// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
26459// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
26460// such VSELECT.
26461 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
26462 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
26463 return SwapResult;
26464
26465 SDValue N0 = N->getOperand(0);
26466 SDValue IfTrue = N->getOperand(1);
26467 SDValue IfFalse = N->getOperand(2);
26468 EVT ResVT = N->getValueType(0);
26469 EVT CCVT = N0.getValueType();
26470
26471 if (isAllActivePredicate(DAG, N0))
26472 return N->getOperand(1);
26473
26474 if (isAllInactivePredicate(N0))
26475 return N->getOperand(2);
26476
26477 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
26478 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
26479 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
26480 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
26481 // -> merge_passthru_op A, B,{Bn,} C
26482 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
26483 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
26484 IfTrue->getOperand(0) == N0) {
26485 SmallVector<SDValue> Ops(IfTrue->op_begin(), IfTrue->op_end());
26486 Ops[0] = N0;
26487 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
26488
26489 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
26490 }
26491 }
26492
26493 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
26494 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
26495 // supported types.
26496 SDValue SetCC = N->getOperand(0);
26497 if (SetCC.getOpcode() == ISD::SETCC &&
26498 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26499 SDValue CmpLHS = SetCC.getOperand(0);
26500 EVT VT = CmpLHS.getValueType();
26501 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26502 SDNode *SplatLHS = N->getOperand(1).getNode();
26503 SDNode *SplatRHS = N->getOperand(2).getNode();
26504 APInt SplatLHSVal;
26505 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26506 VT.isSimple() &&
26507 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26508 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26509 VT.getSimpleVT().SimpleTy) &&
26510 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26511 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26512 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26513 unsigned NumElts = VT.getVectorNumElements();
26514 SmallVector<SDValue, 16> Ops(
26515 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26516 VT.getScalarType()));
26517 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26518
26519 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26520 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26521 return Or;
26522 }
26523 }
26524
26525 EVT CmpVT = N0.getOperand(0).getValueType();
26526 if (N0.getOpcode() != ISD::SETCC ||
26527 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
26528 CCVT.getVectorElementType() != MVT::i1 ||
26529 CmpVT.getVectorElementType().isFloatingPoint())
26530 return SDValue();
26531
26532 // Only combine when the result type is of the same size as the compared
26533 // operands.
26534 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26535 return SDValue();
26536
26537 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26538 N0.getOperand(0), N0.getOperand(1),
26539 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26540 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26541 IfTrue, IfFalse);
26542}
26543
26544/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26545/// the compare-mask instructions rather than going via NZCV, even if LHS and
26546/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26547/// with a vector one followed by a DUP shuffle on the result.
26550 SelectionDAG &DAG = DCI.DAG;
26551 SDValue N0 = N->getOperand(0);
26552 EVT ResVT = N->getValueType(0);
26553
26554 if (N0.getOpcode() != ISD::SETCC)
26555 return SDValue();
26556
26557 if (ResVT.isScalableVT())
26558 return SDValue();
26559
26560 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26561 // scalar SetCCResultType. We also don't expect vectors, because we assume
26562 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26563 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26564 "Scalar-SETCC feeding SELECT has unexpected result type!");
26565
26566 // If NumMaskElts == 0, the comparison is larger than select result. The
26567 // largest real NEON comparison is 64-bits per lane, which means the result is
26568 // at most 32-bits and an illegal vector. Just bail out for now.
26569 EVT SrcVT = N0.getOperand(0).getValueType();
26570
26571 // Don't try to do this optimization when the setcc itself has i1 operands.
26572 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26573 // ruled out to prevent the creation of setcc that need to be scalarized.
26574 if (SrcVT == MVT::i1 ||
26575 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26576 return SDValue();
26577
26578 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26579 if (!ResVT.isVector() || NumMaskElts == 0)
26580 return SDValue();
26581
26582 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26583 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26584
26585 // Also bail out if the vector CCVT isn't the same size as ResVT.
26586 // This can happen if the SETCC operand size doesn't divide the ResVT size
26587 // (e.g., f64 vs v3f32).
26588 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26589 return SDValue();
26590
26591 // Make sure we didn't create illegal types, if we're not supposed to.
26592 assert(DCI.isBeforeLegalize() ||
26593 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26594
26595 // First perform a vector comparison, where lane 0 is the one we're interested
26596 // in.
26597 SDLoc DL(N0);
26598 SDValue LHS =
26599 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26600 SDValue RHS =
26601 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26602 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26603
26604 // Now duplicate the comparison mask we want across all other lanes.
26605 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26606 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26607 Mask = DAG.getNode(ISD::BITCAST, DL,
26608 ResVT.changeVectorElementTypeToInteger(), Mask);
26609
26610 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26611}
26612
26613 static SDValue performDUPCombine(SDNode *N,
26614 TargetLowering::DAGCombinerInfo &DCI) {
26615 EVT VT = N->getValueType(0);
26616 SDLoc DL(N);
26617 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26618 // 128bit vector version.
26619 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26620 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26621 SmallVector<SDValue> Ops(N->ops());
26622 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26623 DCI.DAG.getVTList(LVT), Ops)) {
26624 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26625 DCI.DAG.getConstant(0, DL, MVT::i64));
26626 }
26627 }
26628
26629 if (N->getOpcode() == AArch64ISD::DUP) {
26630 // If the instruction is known to produce a scalar in SIMD registers, we can
26631 // duplicate it across the vector lanes using DUPLANE instead of moving it
26632 // to a GPR first. For example, this allows us to handle:
26633 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26634 SDValue Op = N->getOperand(0);
26635 // FIXME: Ideally, we should be able to handle all instructions that
26636 // produce a scalar value in FPRs.
26637 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26638 Op.getOpcode() == AArch64ISD::FCMGE ||
26639 Op.getOpcode() == AArch64ISD::FCMGT) {
26640 EVT ElemVT = VT.getVectorElementType();
26641 EVT ExpandedVT = VT;
26642 // Insert into a 128-bit vector to match DUPLANE's pattern.
26643 if (VT.getSizeInBits() != 128)
26644 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26645 128 / ElemVT.getSizeInBits());
26646 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26647 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26648 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26649 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26650 }
26651
26652 if (DCI.isAfterLegalizeDAG()) {
26653 // If scalar dup's operand is extract_vector_elt, try to combine them into
26654 // duplane. For example,
26655 //
26656 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26657 // t18: v4i32 = AArch64ISD::DUP t21
26658 // ==>
26659 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26660 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26661 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26662 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26663 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26664 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26665 EXTRACT_VEC_ELT.getOperand(1));
26666 }
26667 }
26668 }
26669
26670 return performPostLD1Combine(N, DCI, false);
26671 }
26672
26673 return SDValue();
26674}
26675
26676/// Get rid of unnecessary NVCASTs (that don't change the type).
26677 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26678 if (N->getValueType(0) == N->getOperand(0).getValueType())
26679 return N->getOperand(0);
26680 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26681 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26682 N->getOperand(0).getOperand(0));
26683
26684 return SDValue();
26685}
26686
26687// If all users of the globaladdr are of the form (globaladdr + constant), find
26688// the smallest constant, fold it into the globaladdr's offset and rewrite the
26689// globaladdr as (globaladdr + constant) - constant.
26690 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
26691 const AArch64Subtarget *Subtarget,
26692 const TargetMachine &TM) {
26693 auto *GN = cast<GlobalAddressSDNode>(N);
26694 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26695 AArch64II::MO_NO_FLAG)
26696 return SDValue();
26697
26698 uint64_t MinOffset = -1ull;
26699 for (SDNode *N : GN->users()) {
26700 if (N->getOpcode() != ISD::ADD)
26701 return SDValue();
26702 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26703 if (!C)
26704 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26705 if (!C)
26706 return SDValue();
26707 MinOffset = std::min(MinOffset, C->getZExtValue());
26708 }
26709 uint64_t Offset = MinOffset + GN->getOffset();
26710
26711 // Require that the new offset is larger than the existing one. Otherwise, we
26712 // can end up oscillating between two possible DAGs, for example,
26713 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26714 if (Offset <= uint64_t(GN->getOffset()))
26715 return SDValue();
26716
26717 // Check whether folding this offset is legal. It must not go out of bounds of
26718 // the referenced object to avoid violating the code model, and must be
26719 // smaller than 2^20 because this is the largest offset expressible in all
26720 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26721 // stores an immediate signed 21 bit offset.)
26722 //
26723 // This check also prevents us from folding negative offsets, which will end
26724 // up being treated in the same way as large positive ones. They could also
26725 // cause code model violations, and aren't really common enough to matter.
26726 if (Offset >= (1 << 20))
26727 return SDValue();
26728
26729 const GlobalValue *GV = GN->getGlobal();
26730 Type *T = GV->getValueType();
26731 if (!T->isSized() ||
26733 return SDValue();
26734
26735 SDLoc DL(GN);
26736 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26737 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26738 DAG.getConstant(MinOffset, DL, MVT::i64));
26739}
26740
26741 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26742 const AArch64Subtarget *Subtarget) {
26743 SDValue BR = N->getOperand(0);
26744 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26746 return SDValue();
26747
26748 SDLoc DL(N);
26749 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26750}
26751
26752 // Turns the vector of indices into a vector of byte offsets by scaling Offset
26753 // by (BitWidth / 8).
26754 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
26755 SDLoc DL, unsigned BitWidth) {
26756 assert(Offset.getValueType().isScalableVector() &&
26757 "This method is only for scalable vectors of offsets");
26758
26759 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26760 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26761
26762 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26763}
26764
26765/// Check if the value of \p OffsetInBytes can be used as an immediate for
26766/// the gather load/prefetch and scatter store instructions with vector base and
26767/// immediate offset addressing mode:
26768///
26769/// [<Zn>.[S|D]{, #<imm>}]
26770///
26771/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26772inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26773 unsigned ScalarSizeInBytes) {
26774 // The immediate is not a multiple of the scalar size.
26775 if (OffsetInBytes % ScalarSizeInBytes)
26776 return false;
26777
26778 // The immediate is out of range.
26779 if (OffsetInBytes / ScalarSizeInBytes > 31)
26780 return false;
26781
26782 return true;
26783}
26784
26785/// Check if the value of \p Offset represents a valid immediate for the SVE
26786 /// gather load/prefetch and scatter store instructions with vector base and
26787/// immediate offset addressing mode:
26788///
26789/// [<Zn>.[S|D]{, #<imm>}]
26790///
26791/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26792 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26793 unsigned ScalarSizeInBytes) {
26794 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26795 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26796 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26797}
26798
26799 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26800 unsigned Opcode,
26801 bool OnlyPackedOffsets = true) {
26802 const SDValue Src = N->getOperand(2);
26803 const EVT SrcVT = Src->getValueType(0);
26804 assert(SrcVT.isScalableVector() &&
26805 "Scatter stores are only possible for SVE vectors");
26806
26807 SDLoc DL(N);
26808 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26809
26810 // Make sure that source data will fit into an SVE register
26811 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26812 return SDValue();
26813
26814 // For FPs, ACLE only supports _packed_ single and double precision types.
26815 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26816 if (SrcElVT.isFloatingPoint())
26817 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26818 ((Opcode != AArch64ISD::SST1Q_PRED &&
26819 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26820 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26821 return SDValue();
26822
26823 // Depending on the addressing mode, this is either a pointer or a vector of
26824 // pointers (that fits into one register)
26825 SDValue Base = N->getOperand(4);
26826 // Depending on the addressing mode, this is either a single offset or a
26827 // vector of offsets (that fits into one register)
26828 SDValue Offset = N->getOperand(5);
26829
26830 // For "scalar + vector of indices", just scale the indices. This applies to
26831 // non-temporal and quadword scatters, which have no addressing mode that
26832 // takes scaled indices.
26833 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26834 Offset =
26835 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
26836 Opcode = AArch64ISD::SSTNT1_PRED;
26837 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26838 Offset =
26839 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
26840 Opcode = AArch64ISD::SST1Q_PRED;
26841 }
26842
26843 // In the case of non-temporal scatter stores there's only one SVE instruction
26844 // per data-size: "scalar + vector", i.e.
26845 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26846 // Since we do have intrinsics that allow the arguments to be in a different
26847 // order, we may need to swap them to match the spec.
26848 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26849 Offset.getValueType().isVector())
26850 std::swap(Base, Offset);
26851
26852 // SST1_IMM requires that the offset is an immediate that is:
26853 // * a multiple of #SizeInBytes,
26854 // * in the range [0, 31 x #SizeInBytes],
26855 // where #SizeInBytes is the size in bytes of the stored items. For
26856 // immediates outside that range and non-immediate scalar offsets use SST1 or
26857 // SST1_UXTW instead.
26858 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26859 if (!isValidImmForSVEVecImmAddrMode(Offset,
26860 SrcVT.getScalarSizeInBits() / 8)) {
26861 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26862 Opcode = AArch64ISD::SST1_UXTW_PRED;
26863 else
26864 Opcode = AArch64ISD::SST1_PRED;
26865
26866 std::swap(Base, Offset);
26867 }
26868 }
26869
26870 auto &TLI = DAG.getTargetLoweringInfo();
26871 if (!TLI.isTypeLegal(Base.getValueType()))
26872 return SDValue();
26873
26874 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26875 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26876 // nxv2i64. Legalize accordingly.
26877 if (!OnlyPackedOffsets &&
26878 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26879 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26880
26881 if (!TLI.isTypeLegal(Offset.getValueType()))
26882 return SDValue();
26883
26884 // Source value type that is representable in hardware
26885 EVT HwSrcVt = getSVEContainerType(SrcVT);
26886
26887 // Keep the original type of the input data to store - this is needed to be
26888 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26889 // FP values we want the integer equivalent, so just use HwSrcVt.
26890 SDValue InputVT = DAG.getValueType(SrcVT);
26891 if (SrcVT.isFloatingPoint())
26892 InputVT = DAG.getValueType(HwSrcVt);
26893
26894 SDVTList VTs = DAG.getVTList(MVT::Other);
26895 SDValue SrcNew;
26896
26897 if (Src.getValueType().isFloatingPoint())
26898 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26899 else
26900 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26901
26902 SDValue Ops[] = {N->getOperand(0), // Chain
26903 SrcNew,
26904 N->getOperand(3), // Pg
26905 Base,
26906 Offset,
26907 InputVT};
26908
26909 return DAG.getNode(Opcode, DL, VTs, Ops);
26910}
26911
26913 unsigned Opcode,
26914 bool OnlyPackedOffsets = true) {
26915 const EVT RetVT = N->getValueType(0);
26916 assert(RetVT.isScalableVector() &&
26917 "Gather loads are only possible for SVE vectors");
26918
26919 SDLoc DL(N);
26920
26921 // Make sure that the loaded data will fit into an SVE register
26922 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26923 return SDValue();
26924
26925 // Depending on the addressing mode, this is either a pointer or a vector of
26926 // pointers (that fits into one register)
26927 SDValue Base = N->getOperand(3);
26928 // Depending on the addressing mode, this is either a single offset or a
26929 // vector of offsets (that fits into one register)
26930 SDValue Offset = N->getOperand(4);
26931
26932 // For "scalar + vector of indices", scale the indices to obtain unscaled
26933 // offsets. This applies to non-temporal and quadword gathers, which do not
26934 // have an addressing mode with scaled offset.
26935 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26936 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26937 RetVT.getScalarSizeInBits());
26938 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26939 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26940 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26941 RetVT.getScalarSizeInBits());
26942 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26943 }
26944
26945 // In the case of non-temporal gather loads and quadword gather loads there's
26946 // only one addressing mode : "vector + scalar", e.g.
26947 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26948 // Since we do have intrinsics that allow the arguments to be in a different
26949 // order, we may need to swap them to match the spec.
26950 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26951 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26952 Offset.getValueType().isVector())
26953 std::swap(Base, Offset);
26954
26955 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26956 // * a multiple of #SizeInBytes,
26957 // * in the range [0, 31 x #SizeInBytes],
26958 // where #SizeInBytes is the size in bytes of the loaded items. For
26959 // immediates outside that range and non-immediate scalar offsets use
26960 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26961 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26962 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26963 if (!isValidImmForSVEVecImmAddrMode(Offset,
26964 RetVT.getScalarSizeInBits() / 8)) {
26965 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26966 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26967 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26968 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26969 else
26970 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26971 ? AArch64ISD::GLD1_MERGE_ZERO
26972 : AArch64ISD::GLDFF1_MERGE_ZERO;
26973
26974 std::swap(Base, Offset);
26975 }
26976 }
26977
26978 auto &TLI = DAG.getTargetLoweringInfo();
26979 if (!TLI.isTypeLegal(Base.getValueType()))
26980 return SDValue();
26981
26982 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26983 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26984 // nxv2i64. Legalize accordingly.
26985 if (!OnlyPackedOffsets &&
26986 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26987 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26988
26989 // Return value type that is representable in hardware
26990 EVT HwRetVt = getSVEContainerType(RetVT);
26991
26992 // Keep the original output value type around - this is needed to be able to
26993 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26994 // values we want the integer equivalent, so just use HwRetVT.
26995 SDValue OutVT = DAG.getValueType(RetVT);
26996 if (RetVT.isFloatingPoint())
26997 OutVT = DAG.getValueType(HwRetVt);
26998
26999 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
27000 SDValue Ops[] = {N->getOperand(0), // Chain
27001 N->getOperand(2), // Pg
27002 Base, Offset, OutVT};
27003
27004 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
27005 SDValue LoadChain = SDValue(Load.getNode(), 1);
27006
27007 if (RetVT.isInteger() && (RetVT != HwRetVt))
27008 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
27009
27010 // If the original return value was FP, bitcast accordingly. Doing it here
27011 // means that we can avoid adding TableGen patterns for FPs.
27012 if (RetVT.isFloatingPoint())
27013 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
27014
27015 return DAG.getMergeValues({Load, LoadChain}, DL);
27016}
27017
27018static SDValue
27019 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27020 SelectionDAG &DAG) {
27021 SDLoc DL(N);
27022 SDValue Src = N->getOperand(0);
27023 unsigned Opc = Src->getOpcode();
27024
27025 // Sign extend of an unsigned unpack -> signed unpack
27026 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
27027
27028 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
27029 : AArch64ISD::SUNPKLO;
27030
27031 // Push the sign extend to the operand of the unpack
27032 // This is necessary where, for example, the operand of the unpack
27033 // is another unpack:
27034 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
27035 // ->
27036 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
27037 // ->
27038 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
27039 SDValue ExtOp = Src->getOperand(0);
27040 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
27041 EVT EltTy = VT.getVectorElementType();
27042 (void)EltTy;
27043
27044 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
27045 "Sign extending from an invalid type");
27046
27047 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
27048
27048
27049 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
27050 ExtOp, DAG.getValueType(ExtVT));
27051
27052 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
27053 }
27054
27055 // Sign extend of CSET -> CSETM.
27056 if (Opc == AArch64ISD::CSEL &&
27057 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
27058 EVT VT = N->getValueType(0);
27059 SDValue TVal = Src.getOperand(0);
27060 SDValue FVal = Src.getOperand(1);
27061
27062 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
27063 if (isNullConstant(TVal) && isOneConstant(FVal))
27064 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
27065 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
27066 Src.getOperand(3));
27067
27068 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
27069 if (isOneConstant(TVal) && isNullConstant(FVal))
27070 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
27071 DAG.getAllOnesConstant(DL, VT), FVal,
27072 Src.getOperand(2), Src.getOperand(3));
27073 }
27074
27075 if (DCI.isBeforeLegalizeOps())
27076 return SDValue();
27077
27078 if (!EnableCombineMGatherIntrinsics)
27079 return SDValue();
27080
27081 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
27082 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
27083 unsigned NewOpc;
27084 unsigned MemVTOpNum = 4;
27085 switch (Opc) {
27086 case AArch64ISD::LD1_MERGE_ZERO:
27087 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
27088 MemVTOpNum = 3;
27089 break;
27090 case AArch64ISD::LDNF1_MERGE_ZERO:
27091 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
27092 MemVTOpNum = 3;
27093 break;
27094 case AArch64ISD::LDFF1_MERGE_ZERO:
27095 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
27096 MemVTOpNum = 3;
27097 break;
27098 case AArch64ISD::GLD1_MERGE_ZERO:
27099 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
27100 break;
27101 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27102 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
27103 break;
27104 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27105 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
27106 break;
27107 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27108 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
27109 break;
27110 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27111 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
27112 break;
27113 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27114 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
27115 break;
27116 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27117 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
27118 break;
27119 case AArch64ISD::GLDFF1_MERGE_ZERO:
27120 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
27121 break;
27122 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
27123 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
27124 break;
27125 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
27126 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
27127 break;
27128 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
27129 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
27130 break;
27131 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
27132 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
27133 break;
27134 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
27135 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
27136 break;
27137 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
27138 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
27139 break;
27140 case AArch64ISD::GLDNT1_MERGE_ZERO:
27141 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
27142 break;
27143 default:
27144 return SDValue();
27145 }
27146
27147 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
27148 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
27149
27150 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
27151 return SDValue();
27152
27153 EVT DstVT = N->getValueType(0);
27154 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
27155
27156 SmallVector<SDValue, 5> Ops;
27157 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
27158 Ops.push_back(Src->getOperand(I));
27159
27160 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
27161 DCI.CombineTo(N, ExtLoad);
27162 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
27163
27164 // Return N so it doesn't get rechecked
27165 return SDValue(N, 0);
27166}
27167
27168/// Legalize the gather prefetch (scalar + vector addressing mode) when the
27169/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
27170/// != nxv2i32) do not need legalization.
27171 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
27172 const unsigned OffsetPos = 4;
27173 SDValue Offset = N->getOperand(OffsetPos);
27174
27175 // Not an unpacked vector, bail out.
27176 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
27177 return SDValue();
27178
27179 // Extend the unpacked offset vector to 64-bit lanes.
27180 SDLoc DL(N);
27181 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
27182 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
27183 // Replace the offset operand with the 64-bit one.
27184 Ops[OffsetPos] = Offset;
27185
27186 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27187}
27188
27189/// Combines a node carrying the intrinsic
27190/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27191/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27192/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27193/// sve gather prefetch instruction with vector plus immediate addressing mode.
27194 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
27195 unsigned ScalarSizeInBytes) {
27196 const unsigned ImmPos = 4, OffsetPos = 3;
27197 // No need to combine the node if the immediate is valid...
27198 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27199 return SDValue();
27200
27201 // ...otherwise swap the offset base with the offset...
27202 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
27203 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27204 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27205 // `aarch64_sve_prfb_gather_uxtw_index`.
27206 SDLoc DL(N);
27207 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
27208 MVT::i64);
27209
27210 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27211}
27212
27213// Return true if the vector operation can guarantee only the first lane of its
27214// result contains data, with all bits in other lanes set to zero.
27215 static bool isLanes1toNKnownZero(SDValue Op) {
27216 switch (Op.getOpcode()) {
27217 default:
27218 return false;
27219 case AArch64ISD::ANDV_PRED:
27220 case AArch64ISD::EORV_PRED:
27221 case AArch64ISD::FADDA_PRED:
27222 case AArch64ISD::FADDV_PRED:
27223 case AArch64ISD::FMAXNMV_PRED:
27224 case AArch64ISD::FMAXV_PRED:
27225 case AArch64ISD::FMINNMV_PRED:
27226 case AArch64ISD::FMINV_PRED:
27227 case AArch64ISD::ORV_PRED:
27228 case AArch64ISD::SADDV_PRED:
27229 case AArch64ISD::SMAXV_PRED:
27230 case AArch64ISD::SMINV_PRED:
27231 case AArch64ISD::UADDV_PRED:
27232 case AArch64ISD::UMAXV_PRED:
27233 case AArch64ISD::UMINV_PRED:
27234 return true;
27235 }
27236}
27237
27238// Return true if the vector operation can guarantee that the first lane of its
27239// result is active.
27240 static bool isLane0KnownActive(SDValue Op) {
27241 switch (Op.getOpcode()) {
27242 default:
27243 return false;
27244 case AArch64ISD::REINTERPRET_CAST:
27245 return isLane0KnownActive(Op->getOperand(0));
27246 case ISD::SPLAT_VECTOR:
27247 return isOneConstant(Op.getOperand(0));
27248 case AArch64ISD::PTRUE:
27249 return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
27250 };
27251}
27252
27253 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
27254 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27255 SDValue InsertVec = N->getOperand(0);
27256 SDValue InsertElt = N->getOperand(1);
27257 SDValue InsertIdx = N->getOperand(2);
27258
27259 // We only care about inserts into the first element...
27260 if (!isNullConstant(InsertIdx))
27261 return SDValue();
27262 // ...of a zero'd vector...
27264 return SDValue();
27265 // ...where the inserted data was previously extracted...
27266 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27267 return SDValue();
27268
27269 SDValue ExtractVec = InsertElt.getOperand(0);
27270 SDValue ExtractIdx = InsertElt.getOperand(1);
27271
27272 // ...from the first element of a vector.
27273 if (!isNullConstant(ExtractIdx))
27274 return SDValue();
27275
27276 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27277
27278 // Ensure there's no type conversion going on.
27279 if (N->getValueType(0) != ExtractVec.getValueType())
27280 return SDValue();
27281
27282 if (!isLanes1toNKnownZero(ExtractVec))
27283 return SDValue();
27284
27285 // The explicit zeroing is redundant.
27286 return ExtractVec;
27287}
27288
27289static SDValue
27290 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
27291 if (SDValue Res = removeRedundantInsertVectorElt(N))
27292 return Res;
27293
27294 return performPostLD1Combine(N, DCI, true);
27295}
27296
27297 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27298 TargetLowering::DAGCombinerInfo &DCI,
27299 const AArch64Subtarget *Subtarget) {
27300 SDValue N0 = N->getOperand(0);
27301 EVT VT = N->getValueType(0);
27302
27303 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
27304 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
27305 return SDValue();
27306
27307 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
27308 EVT EltVT = VT.getVectorElementType();
27309 return EltVT == MVT::f32 || EltVT == MVT::f64;
27310 };
27311
27312 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
27313 // We purposefully don't care about legality of the nodes here as we know
27314 // they can be split down into something legal.
27315 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
27316 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
27317 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
27318 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
27319 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
27320 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
27321 LN0->getChain(), LN0->getBasePtr(),
27322 N0.getValueType(), LN0->getMemOperand());
27323 DCI.CombineTo(N, ExtLoad);
27324 DCI.CombineTo(
27325 N0.getNode(),
27326 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
27327 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
27328 ExtLoad.getValue(1));
27329 return SDValue(N, 0); // Return N so it doesn't get rechecked!
27330 }
27331
27332 return SDValue();
27333}
27334
27335 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
27336 const AArch64Subtarget *Subtarget) {
27337 EVT VT = N->getValueType(0);
27338
27339 // Don't expand for NEON, SVE2 or SME
27340 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
27341 return SDValue();
27342
27343 SDLoc DL(N);
27344
27345 SDValue Mask = N->getOperand(0);
27346 SDValue In1 = N->getOperand(1);
27347 SDValue In2 = N->getOperand(2);
27348
27349 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
27350 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
27351 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
27352 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
27353}
27354
27355 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
27356 EVT VT = N->getValueType(0);
27357
27358 SDValue Insert = N->getOperand(0);
27359 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
27360 return SDValue();
27361
27362 if (!Insert.getOperand(0).isUndef())
27363 return SDValue();
27364
27365 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
27366 uint64_t IdxDupLane = N->getConstantOperandVal(1);
27367 if (IdxInsert != 0 || IdxDupLane != 0)
27368 return SDValue();
27369
27370 SDValue Bitcast = Insert.getOperand(1);
27371 if (Bitcast.getOpcode() != ISD::BITCAST)
27372 return SDValue();
27373
27374 SDValue Subvec = Bitcast.getOperand(0);
27375 EVT SubvecVT = Subvec.getValueType();
27376 if (!SubvecVT.is128BitVector())
27377 return SDValue();
27378 EVT NewSubvecVT =
27379 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
27380
27381 SDLoc DL(N);
27382 SDValue NewInsert =
27383 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27384 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27385 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27386 NewInsert, N->getOperand(1));
27387 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27388}
27389
27390// Try to combine mull with uzp1.
27391 static SDValue tryCombineMULLWithUZP1(SDNode *N,
27392 TargetLowering::DAGCombinerInfo &DCI,
27393 SelectionDAG &DAG) {
27394 if (DCI.isBeforeLegalizeOps())
27395 return SDValue();
27396
27397 SDValue LHS = N->getOperand(0);
27398 SDValue RHS = N->getOperand(1);
27399
27400 SDValue ExtractHigh;
27401 SDValue ExtractLow;
27402 SDValue TruncHigh;
27403 SDValue TruncLow;
27404 SDLoc DL(N);
27405
27406 // Check the operands are trunc and extract_high.
27407 if (isEssentiallyExtractHighSubvector(LHS) &&
27408 RHS.getOpcode() == ISD::TRUNCATE) {
27409 TruncHigh = RHS;
27410 if (LHS.getOpcode() == ISD::BITCAST)
27411 ExtractHigh = LHS.getOperand(0);
27412 else
27413 ExtractHigh = LHS;
27414 } else if (isEssentiallyExtractHighSubvector(RHS) &&
27415 LHS.getOpcode() == ISD::TRUNCATE) {
27416 TruncHigh = LHS;
27417 if (RHS.getOpcode() == ISD::BITCAST)
27418 ExtractHigh = RHS.getOperand(0);
27419 else
27420 ExtractHigh = RHS;
27421 } else
27422 return SDValue();
27423
27424 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27425 // with uzp1.
27426 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27427 SDValue TruncHighOp = TruncHigh.getOperand(0);
27428 EVT TruncHighOpVT = TruncHighOp.getValueType();
27429 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
27430 DAG.isSplatValue(TruncHighOp, false))
27431 return SDValue();
27432
27433 // Check there is other extract_high with same source vector.
27434 // For example,
27435 //
27436 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
27437 // t12: v4i16 = truncate t11
27438 // t31: v4i32 = AArch64ISD::SMULL t18, t12
27439 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
27440 // t16: v4i16 = truncate t15
27441 // t30: v4i32 = AArch64ISD::SMULL t23, t1
27442 //
27443 // This dagcombine assumes the two extract_high uses same source vector in
27444 // order to detect the pair of the mull. If they have different source vector,
27445 // this code will not work.
27446 // TODO: Should also try to look through a bitcast.
27447 bool HasFoundMULLow = true;
27448 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
27449 if (ExtractHighSrcVec->use_size() != 2)
27450 HasFoundMULLow = false;
27451
27452 // Find ExtractLow.
27453 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
27454 if (User == ExtractHigh.getNode())
27455 continue;
27456
27457 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27458 !isNullConstant(User->getOperand(1))) {
27459 HasFoundMULLow = false;
27460 break;
27461 }
27462
27463 ExtractLow.setNode(User);
27464 }
27465
27466 if (!ExtractLow || !ExtractLow->hasOneUse())
27467 HasFoundMULLow = false;
27468
27469 // Check ExtractLow's user.
27470 if (HasFoundMULLow) {
27471 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
27472 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
27473 HasFoundMULLow = false;
27474 } else {
27475 if (ExtractLowUser->getOperand(0) == ExtractLow) {
27476 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
27477 TruncLow = ExtractLowUser->getOperand(1);
27478 else
27479 HasFoundMULLow = false;
27480 } else {
27481 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
27482 TruncLow = ExtractLowUser->getOperand(0);
27483 else
27484 HasFoundMULLow = false;
27485 }
27486 }
27487 }
27488
27489 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27490 // with uzp1.
27491 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27492 EVT TruncHighVT = TruncHigh.getValueType();
27493 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27494 SDValue TruncLowOp =
27495 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
27496 EVT TruncLowOpVT = TruncLowOp.getValueType();
27497 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
27498 DAG.isSplatValue(TruncLowOp, false)))
27499 return SDValue();
27500
27501 // Create uzp1, extract_high and extract_low.
27502 if (TruncHighOpVT != UZP1VT)
27503 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
27504 if (TruncLowOpVT != UZP1VT)
27505 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
27506
27507 SDValue UZP1 =
27508 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
27509 SDValue HighIdxCst =
27510 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
27511 SDValue NewTruncHigh =
27512 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
27513 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
27514
27515 if (HasFoundMULLow) {
27516 EVT TruncLowVT = TruncLow.getValueType();
27517 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
27518 UZP1, ExtractLow.getOperand(1));
27519 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
27520 }
27521
27522 return SDValue(N, 0);
27523}
27524
27525 static SDValue performMULLCombine(SDNode *N,
27526 TargetLowering::DAGCombinerInfo &DCI,
27527 SelectionDAG &DAG) {
27528 if (SDValue Val =
27529 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
27530 return Val;
27531
27532 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27533 return Val;
27534
27535 return SDValue();
27536}
27537
27538 static SDValue performPTestFirstCombine(SDNode *N,
27539 TargetLowering::DAGCombinerInfo &DCI,
27540 SelectionDAG &DAG) {
27541 if (DCI.isBeforeLegalize())
27542 return SDValue();
27543
27544 SDLoc DL(N);
27545 auto Mask = N->getOperand(0);
27546 auto Pred = N->getOperand(1);
27547
27548 if (!isLane0KnownActive(Mask))
27549 return SDValue();
27550
27551 if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
27552 Pred = Pred->getOperand(0);
27553
27554 if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
27555 Pred = Pred->getOperand(0);
27556 Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
27557 return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
27558 Pred);
27559 }
27560
27561 return SDValue();
27562}
27563
27564static SDValue
27565 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27566 SelectionDAG &DAG) {
27567 // Let's do below transform.
27568 //
27569 // t34: v4i32 = AArch64ISD::UADDLV t2
27570 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27571 // t7: i64 = zero_extend t35
27572 // t20: v1i64 = scalar_to_vector t7
27573 // ==>
27574 // t34: v4i32 = AArch64ISD::UADDLV t2
27575 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27576 // t40: v1i64 = AArch64ISD::NVCAST t39
27577 if (DCI.isBeforeLegalizeOps())
27578 return SDValue();
27579
27580 EVT VT = N->getValueType(0);
27581 if (VT != MVT::v1i64)
27582 return SDValue();
27583
27584 SDValue ZEXT = N->getOperand(0);
27585 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27586 return SDValue();
27587
27588 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27589 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27590 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27591 return SDValue();
27592
27593 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27594 return SDValue();
27595
27596 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27597 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27598 UADDLV.getValueType() != MVT::v4i32 ||
27599 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27600 return SDValue();
27601
27602 // Let's generate new sequence with AArch64ISD::NVCAST.
27603 SDLoc DL(N);
27604 SDValue EXTRACT_SUBVEC =
27605 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27606 DAG.getConstant(0, DL, MVT::i64));
27607 SDValue NVCAST =
27608 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27609
27610 return NVCAST;
27611}
27612
27613 static SDValue performVectorDeinterleaveCombine(
27614 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
27615 if (!DCI.isBeforeLegalize())
27616 return SDValue();
27617
27618 unsigned NumParts = N->getNumOperands();
27619 if (NumParts != 2 && NumParts != 4)
27620 return SDValue();
27621
27622 EVT SubVecTy = N->getValueType(0);
27623
27624 // At the moment we're unlikely to see a fixed-width vector deinterleave as
27625 // we usually generate shuffles instead.
27626 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
27627 if (!SubVecTy.isScalableVector() ||
27628 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
27629 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
27630 return SDValue();
27631
27632 // Make sure each input operand is the correct extract_subvector of the same
27633 // wider vector.
27634 SDValue Op0 = N->getOperand(0);
27635 for (unsigned I = 0; I < NumParts; I++) {
27636 SDValue OpI = N->getOperand(I);
27637 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27638 OpI->getOperand(0) != Op0->getOperand(0))
27639 return SDValue();
27640 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
27641 return SDValue();
27642 }
27643
27644 // Normal loads are currently already handled by the InterleavedAccessPass so
27645 // we don't expect to see them here. Bail out if the masked load has an
27646 // unexpected number of uses, since we want to avoid a situation where we have
27647 // both deinterleaving loads and normal loads in the same block. Also, discard
27648 // masked loads that are extending, indexed, have an unexpected offset or have
27649 // an unsupported passthru value until we find a valid use case.
27650 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
27651 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
27652 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
27653 !MaskedLoad->getOffset().isUndef() ||
27654 (!MaskedLoad->getPassThru()->isUndef() &&
27655 !isZerosVector(MaskedLoad->getPassThru().getNode())))
27656 return SDValue();
27657
27658 // Now prove that the mask is an interleave of identical masks.
27659 SDLoc DL(N);
27660 SDValue NarrowMask =
27661 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
27662 if (!NarrowMask)
27663 return SDValue();
27664
27665 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
27666 : Intrinsic::aarch64_sve_ld4_sret;
27667 SDValue NewLdOps[] = {MaskedLoad->getChain(),
27668 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
27669 MaskedLoad->getBasePtr()};
27670 SDValue Res;
27671 if (NumParts == 2)
27672 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27673 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
27674 else
27675 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27676 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
27677 NewLdOps);
27678
27679 // We can now generate a structured load!
27680 SmallVector<SDValue, 4> ResOps(NumParts);
27681 for (unsigned Idx = 0; Idx < NumParts; Idx++)
27682 ResOps[Idx] = SDValue(Res.getNode(), Idx);
27683
27684 // Replace uses of the original chain result with the new chain result.
27685 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
27686 SDValue(Res.getNode(), NumParts));
27687 return DCI.CombineTo(N, ResOps, false);
27688}
27689
27690/// If the operand is a bitwise AND with a constant RHS, and the shift has a
27691/// constant RHS and is the only use, we can pull it out of the shift, i.e.
27692///
27693/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27694///
27695/// We prefer this canonical form to match existing isel patterns.
27696 static SDValue performSHLCombine(SDNode *N,
27697 TargetLowering::DAGCombinerInfo &DCI,
27698 SelectionDAG &DAG) {
27699 if (DCI.isBeforeLegalizeOps())
27700 return SDValue();
27701
27702 SDValue Op0 = N->getOperand(0);
27703 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27704 return SDValue();
27705
27706 SDValue C1 = Op0->getOperand(1);
27707 SDValue C2 = N->getOperand(1);
27708 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27709 return SDValue();
27710
27711 // Might be folded into shifted op, do not lower.
27712 if (N->hasOneUse()) {
27713 unsigned UseOpc = N->user_begin()->getOpcode();
27714 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27715 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27716 return SDValue();
27717 }
27718
27719 SDLoc DL(N);
27720 EVT VT = N->getValueType(0);
27721
27722 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27723 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27724 // causing infinite loop. Result may also be worse.
27725 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27726 if (!isa<ConstantSDNode>(NewRHS))
27727 return SDValue();
27728
27729 SDValue X = Op0->getOperand(0);
27730 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27731 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27732}
27733
27734 static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
27735 unsigned IntrinsicID = N->getConstantOperandVal(1);
27736 auto Register =
27737 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27738 : AArch64SysReg::RNDRRS);
27739 SDLoc DL(N);
27740 SDValue A = DAG.getNode(
27741 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27742 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27743 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27744 DAG.getConstant(0, DL, MVT::i32),
27745 DAG.getConstant(0, DL, MVT::i32),
27746 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27747 return DAG.getMergeValues(
27748 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27749}
27750
27751 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
27752 DAGCombinerInfo &DCI) const {
27753 SelectionDAG &DAG = DCI.DAG;
27754 switch (N->getOpcode()) {
27755 default:
27756 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27757 break;
27758 case ISD::VECTOR_DEINTERLEAVE:
27759 return performVectorDeinterleaveCombine(N, DCI, DAG);
27760 case ISD::VECREDUCE_AND:
27761 case ISD::VECREDUCE_OR:
27762 case ISD::VECREDUCE_XOR:
27763 return performVecReduceBitwiseCombine(N, DCI, DAG);
27764 case ISD::ADD:
27765 case ISD::SUB:
27766 return performAddSubCombine(N, DCI);
27767 case ISD::BUILD_VECTOR:
27768 return performBuildVectorCombine(N, DCI, DAG);
27769 case ISD::SMIN:
27770 return performSMINCombine(N, DAG);
27771 case ISD::TRUNCATE:
27772 return performTruncateCombine(N, DAG, DCI);
27773 case AArch64ISD::ANDS:
27774 return performFlagSettingCombine(N, DCI, ISD::AND);
27775 case AArch64ISD::ADC:
27776 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27777 return R;
27778 return foldADCToCINC(N, DAG);
27779 case AArch64ISD::SBC:
27780 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27781 case AArch64ISD::ADCS:
27782 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27783 return R;
27784 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27785 case AArch64ISD::SBCS:
27786 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27787 return R;
27788 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27789 case AArch64ISD::ADDS:
27790 return performFlagSettingCombine(N, DCI, ISD::ADD);
27791 case AArch64ISD::SUBS:
27792 return performFlagSettingCombine(N, DCI, ISD::SUB);
27793 case AArch64ISD::BICi: {
27795 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27796 APInt DemandedElts =
27797 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27798
27800 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27801 return SDValue();
27802
27803 break;
27804 }
27805 case ISD::XOR:
27806 return performXorCombine(N, DAG, DCI, Subtarget);
27807 case ISD::MUL:
27808 return performMulCombine(N, DAG, DCI, Subtarget);
27809 case ISD::SINT_TO_FP:
27810 case ISD::UINT_TO_FP:
27811 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27812 case ISD::FP_TO_SINT:
27813 case ISD::FP_TO_UINT:
27816 return performFpToIntCombine(N, DAG, DCI, Subtarget);
27817 case ISD::OR:
27818 return performORCombine(N, DCI, Subtarget, *this);
27819 case ISD::AND:
27820 return performANDCombine(N, DCI);
27821 case ISD::FADD:
27822 return performFADDCombine(N, DCI);
27824 return performIntrinsicCombine(N, DCI, Subtarget);
27825 case ISD::ANY_EXTEND:
27826 case ISD::ZERO_EXTEND:
27827 case ISD::SIGN_EXTEND:
27828 return performExtendCombine(N, DCI, DAG);
27830 return performSignExtendInRegCombine(N, DCI, DAG);
27832 return performConcatVectorsCombine(N, DCI, DAG);
27834 return performExtractSubvectorCombine(N, DCI, DAG);
27836 return performInsertSubvectorCombine(N, DCI, DAG);
27837 case ISD::SELECT:
27838 return performSelectCombine(N, DCI);
27839 case ISD::VSELECT:
27840 return performVSelectCombine(N, DCI.DAG);
27841 case ISD::SETCC:
27842 return performSETCCCombine(N, DCI, DAG);
27843 case ISD::LOAD:
27844 return performLOADCombine(N, DCI, DAG, Subtarget);
27845 case ISD::STORE:
27846 return performSTORECombine(N, DCI, DAG, Subtarget);
27847 case ISD::MSTORE:
27848 return performMSTORECombine(N, DCI, DAG, Subtarget);
27849 case ISD::MGATHER:
27850 case ISD::MSCATTER:
27851 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
27852 return performMaskedGatherScatterCombine(N, DCI, DAG);
27853 case ISD::FP_EXTEND:
27854 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27855 case AArch64ISD::BRCOND:
27856 return performBRCONDCombine(N, DCI, DAG);
27857 case AArch64ISD::TBNZ:
27858 case AArch64ISD::TBZ:
27859 return performTBZCombine(N, DCI, DAG);
27860 case AArch64ISD::CSEL:
27861 return performCSELCombine(N, DCI, DAG);
27862 case AArch64ISD::DUP:
27863 case AArch64ISD::DUPLANE8:
27864 case AArch64ISD::DUPLANE16:
27865 case AArch64ISD::DUPLANE32:
27866 case AArch64ISD::DUPLANE64:
27867 return performDUPCombine(N, DCI);
27868 case AArch64ISD::DUPLANE128:
27869 return performDupLane128Combine(N, DAG);
27870 case AArch64ISD::NVCAST:
27871 return performNVCASTCombine(N, DAG);
27872 case AArch64ISD::SPLICE:
27873 return performSpliceCombine(N, DAG);
27874 case AArch64ISD::UUNPKLO:
27875 case AArch64ISD::UUNPKHI:
27876 return performUnpackCombine(N, DAG, Subtarget);
27877 case AArch64ISD::UZP1:
27878 case AArch64ISD::UZP2:
27879 return performUzpCombine(N, DAG, Subtarget);
27880 case AArch64ISD::SETCC_MERGE_ZERO:
27881 return performSetccMergeZeroCombine(N, DCI);
27882 case AArch64ISD::REINTERPRET_CAST:
27884 case AArch64ISD::GLD1_MERGE_ZERO:
27885 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27886 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27887 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27888 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27889 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27890 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27891 case AArch64ISD::GLD1S_MERGE_ZERO:
27892 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27893 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27894 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27895 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27896 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27897 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27898 return performGLD1Combine(N, DAG);
27899 case AArch64ISD::VASHR:
27900 case AArch64ISD::VLSHR:
27901 return performVectorShiftCombine(N, *this, DCI);
27902 case AArch64ISD::SUNPKLO:
27903 return performSunpkloCombine(N, DAG);
27904 case AArch64ISD::BSP:
27905 return performBSPExpandForSVE(N, DAG, Subtarget);
27907 return performInsertVectorEltCombine(N, DCI);
27909 return performExtractVectorEltCombine(N, DCI, Subtarget);
27910 case ISD::VECREDUCE_ADD:
27911 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27912 case ISD::GET_ACTIVE_LANE_MASK:
27913 return performActiveLaneMaskCombine(N, DCI, Subtarget);
27914 case AArch64ISD::UADDV:
27915 return performUADDVCombine(N, DAG);
27916 case AArch64ISD::SMULL:
27917 case AArch64ISD::UMULL:
27918 case AArch64ISD::PMULL:
27919 return performMULLCombine(N, DCI, DAG);
27920 case AArch64ISD::PTEST_FIRST:
27921 return performPTestFirstCombine(N, DCI, DAG);
27924 switch (N->getConstantOperandVal(1)) {
27925 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27926 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27927 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27928 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27929 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27930 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27931 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27932 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27933 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27934 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27935 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27936 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27937 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27938 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27939 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27940 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27942 case Intrinsic::aarch64_neon_ld2:
27943 case Intrinsic::aarch64_neon_ld3:
27944 case Intrinsic::aarch64_neon_ld4:
27945 case Intrinsic::aarch64_neon_ld1x2:
27946 case Intrinsic::aarch64_neon_ld1x3:
27947 case Intrinsic::aarch64_neon_ld1x4:
27948 case Intrinsic::aarch64_neon_ld2lane:
27949 case Intrinsic::aarch64_neon_ld3lane:
27950 case Intrinsic::aarch64_neon_ld4lane:
27951 case Intrinsic::aarch64_neon_ld2r:
27952 case Intrinsic::aarch64_neon_ld3r:
27953 case Intrinsic::aarch64_neon_ld4r:
27954 case Intrinsic::aarch64_neon_st2:
27955 case Intrinsic::aarch64_neon_st3:
27956 case Intrinsic::aarch64_neon_st4:
27957 case Intrinsic::aarch64_neon_st1x2:
27958 case Intrinsic::aarch64_neon_st1x3:
27959 case Intrinsic::aarch64_neon_st1x4:
27960 case Intrinsic::aarch64_neon_st2lane:
27961 case Intrinsic::aarch64_neon_st3lane:
27962 case Intrinsic::aarch64_neon_st4lane:
27963 return performNEONPostLDSTCombine(N, DCI, DAG);
27964 case Intrinsic::aarch64_sve_ldnt1:
27965 return performLDNT1Combine(N, DAG);
27966 case Intrinsic::aarch64_sve_ld1rq:
27968 case Intrinsic::aarch64_sve_ld1ro:
27970 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27971 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27972 case Intrinsic::aarch64_sve_ldnt1_gather:
27973 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27974 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27975 return performGatherLoadCombine(N, DAG,
27976 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27977 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27978 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27979 case Intrinsic::aarch64_sve_ld1:
27980 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27981 case Intrinsic::aarch64_sve_ldnf1:
27982 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27983 case Intrinsic::aarch64_sve_ldff1:
27984 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27985 case Intrinsic::aarch64_sve_st1:
27986 return performST1Combine(N, DAG);
27987 case Intrinsic::aarch64_sve_stnt1:
27988 return performSTNT1Combine(N, DAG);
27989 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27990 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27991 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27992 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27993 case Intrinsic::aarch64_sve_stnt1_scatter:
27994 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27995 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27996 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27997 case Intrinsic::aarch64_sve_ld1_gather:
27998 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
27999 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
28000 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
28001 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
28002 case Intrinsic::aarch64_sve_ld1q_gather_index:
28003 return performGatherLoadCombine(N, DAG,
28004 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
28005 case Intrinsic::aarch64_sve_ld1_gather_index:
28006 return performGatherLoadCombine(N, DAG,
28007 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
28008 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
28009 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
28010 /*OnlyPackedOffsets=*/false);
28011 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
28012 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
28013 /*OnlyPackedOffsets=*/false);
28014 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
28015 return performGatherLoadCombine(N, DAG,
28016 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
28017 /*OnlyPackedOffsets=*/false);
28018 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
28019 return performGatherLoadCombine(N, DAG,
28020 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
28021 /*OnlyPackedOffsets=*/false);
28022 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
28023 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
28024 case Intrinsic::aarch64_sve_ldff1_gather:
28025 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
28026 case Intrinsic::aarch64_sve_ldff1_gather_index:
28027 return performGatherLoadCombine(N, DAG,
28028 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
28029 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
28030 return performGatherLoadCombine(N, DAG,
28031 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
28032 /*OnlyPackedOffsets=*/false);
28033 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
28034 return performGatherLoadCombine(N, DAG,
28035 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
28036 /*OnlyPackedOffsets=*/false);
28037 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
28038 return performGatherLoadCombine(N, DAG,
28039 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
28040 /*OnlyPackedOffsets=*/false);
28041 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
28042 return performGatherLoadCombine(N, DAG,
28043 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
28044 /*OnlyPackedOffsets=*/false);
28045 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
28046 return performGatherLoadCombine(N, DAG,
28047 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
28048 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
28049 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
28050 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
28051 case Intrinsic::aarch64_sve_st1q_scatter_index:
28052 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
28053 case Intrinsic::aarch64_sve_st1_scatter:
28054 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
28055 case Intrinsic::aarch64_sve_st1_scatter_index:
28056 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
28057 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
28058 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
28059 /*OnlyPackedOffsets=*/false);
28060 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
28061 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
28062 /*OnlyPackedOffsets=*/false);
28063 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
28064 return performScatterStoreCombine(N, DAG,
28065 AArch64ISD::SST1_SXTW_SCALED_PRED,
28066 /*OnlyPackedOffsets=*/false);
28067 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
28068 return performScatterStoreCombine(N, DAG,
28069 AArch64ISD::SST1_UXTW_SCALED_PRED,
28070 /*OnlyPackedOffsets=*/false);
28071 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
28072 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
28073 case Intrinsic::aarch64_rndr:
28074 case Intrinsic::aarch64_rndrrs:
28075 return performRNDRCombine(N, DAG);
28076 case Intrinsic::aarch64_sme_ldr_zt:
28077 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
28078 DAG.getVTList(MVT::Other), N->getOperand(0),
28079 N->getOperand(2), N->getOperand(3));
28080 case Intrinsic::aarch64_sme_str_zt:
28081 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
28082 DAG.getVTList(MVT::Other), N->getOperand(0),
28083 N->getOperand(2), N->getOperand(3));
28084 default:
28085 break;
28086 }
28087 break;
28088 case ISD::GlobalAddress:
28089 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
28090 case ISD::CTLZ:
28091 return performCTLZCombine(N, DAG, Subtarget);
28093 return performScalarToVectorCombine(N, DCI, DAG);
28094 case ISD::SHL:
28095 return performSHLCombine(N, DCI, DAG);
28096 }
28097 return SDValue();
28098}
28099
28100// Check if the return value is used only as a return value, as otherwise
28101// we can't perform a tail-call. In particular, we need to check for
28102// target ISD nodes that are returns and any other "odd" constructs
28103// that the generic analysis code won't necessarily catch.
28104bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
28105 SDValue &Chain) const {
28106 if (N->getNumValues() != 1)
28107 return false;
28108 if (!N->hasNUsesOfValue(1, 0))
28109 return false;
28110
28111 SDValue TCChain = Chain;
28112 SDNode *Copy = *N->user_begin();
28113 if (Copy->getOpcode() == ISD::CopyToReg) {
28114 // If the copy has a glue operand, we conservatively assume it isn't safe to
28115 // perform a tail call.
28116 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
28117 MVT::Glue)
28118 return false;
28119 TCChain = Copy->getOperand(0);
28120 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
28121 return false;
28122
28123 bool HasRet = false;
28124 for (SDNode *Node : Copy->users()) {
28125 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
28126 return false;
28127 HasRet = true;
28128 }
28129
28130 if (!HasRet)
28131 return false;
28132
28133 Chain = TCChain;
28134 return true;
28135}
28136
28137// Return whether an instruction can potentially be optimized to a tail
28138// call. This will cause the optimizers to attempt to move, or duplicate,
28139// return instructions to help enable tail call optimizations for this
28140// instruction.
28141bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
28142 return CI->isTailCall();
28143}
28144
28145bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
28146 Register Offset, bool IsPre,
28147 MachineRegisterInfo &MRI) const {
28148 auto CstOffset = getIConstantVRegVal(Offset, MRI);
28149 if (!CstOffset || CstOffset->isZero())
28150 return false;
28151
28152 // All of the indexed addressing mode instructions take a signed 9 bit
28153 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
28154 // encodes the sign/indexing direction.
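  // For illustration: isInt<9> accepts offsets in [-256, 255], so a G_PTR_ADD
  // by 255 or -256 can become a pre/post-indexed access, while 256 cannot.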
28155 return isInt<9>(CstOffset->getSExtValue());
28156}
28157
28158bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
28159 SDValue &Base,
28160 SDValue &Offset,
28161 SelectionDAG &DAG) const {
28162 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
28163 return false;
28164
28165 // Non-null if there is exactly one user of the loaded value (ignoring chain).
28166 SDNode *ValOnlyUser = nullptr;
28167 for (SDUse &U : N->uses()) {
28168 if (U.getResNo() == 1)
28169 continue; // Ignore chain.
28170 if (ValOnlyUser == nullptr)
28171 ValOnlyUser = U.getUser();
28172 else {
28173 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
28174 break;
28175 }
28176 }
28177
28178 auto IsUndefOrZero = [](SDValue V) {
28179 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
28180 };
28181
28182 // If the only user of the value is a scalable vector splat, it is
28183 // preferable to do a replicating load (ld1r*).
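  // For example (illustrative only): a load whose sole use is a SPLAT_VECTOR
  // is better selected as an SVE LD1R* broadcast load than as a pre/post-
  // indexed load followed by a separate duplicate, so refuse the indexed form.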
28184 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
28185 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
28186 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
28187 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
28188 return false;
28189
28190 Base = Op->getOperand(0);
28191 // All of the indexed addressing mode instructions take a signed
28192 // 9 bit immediate offset.
28193 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
28194 int64_t RHSC = RHS->getSExtValue();
28195 if (Op->getOpcode() == ISD::SUB)
28196 RHSC = -(uint64_t)RHSC;
28197 if (!isInt<9>(RHSC))
28198 return false;
28199 // When big-endian VLD1/VST1 are used for vector loads and stores, they
28200 // only allow an offset equal to the store size.
28201 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
28202 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
28203 (uint64_t)RHSC != MemType.getStoreSize())
28204 return false;
28205 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
28206 // when dealing with subtraction.
28207 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
28208 return true;
28209 }
28210 return false;
28211}
28212
28213bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
28214 SDValue &Offset,
28216 SelectionDAG &DAG) const {
28217 EVT VT;
28218 SDValue Ptr;
28219 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28220 VT = LD->getMemoryVT();
28221 Ptr = LD->getBasePtr();
28222 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28223 VT = ST->getMemoryVT();
28224 Ptr = ST->getBasePtr();
28225 } else
28226 return false;
28227
28228 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28229 return false;
28230 AM = ISD::PRE_INC;
28231 return true;
28232}
28233
28234bool AArch64TargetLowering::getPostIndexedAddressParts(
28236 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28237 EVT VT;
28238 SDValue Ptr;
28239 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28240 VT = LD->getMemoryVT();
28241 Ptr = LD->getBasePtr();
28242 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28243 VT = ST->getMemoryVT();
28244 Ptr = ST->getBasePtr();
28245 } else
28246 return false;
28247
28248 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28249 return false;
28250 // Post-indexing updates the base, so it's not a valid transform
28251 // if that's not the same as the load's pointer.
28252 if (Ptr != Base)
28253 return false;
28254 AM = ISD::POST_INC;
28255 return true;
28256}
28257
28260 SelectionDAG &DAG) {
28261 SDLoc DL(N);
28262 SDValue Op = N->getOperand(0);
28263 EVT VT = N->getValueType(0);
28264 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28265 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28266 "Must be bool vector.");
28267
28268 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28269 // elements, it adds a vector concatenation with undef(s). If we encounter
28270 // this here, we can skip the concat.
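  // e.g. (concat_vectors X, undef) or (concat_vectors X, undef, undef, undef)
  // is treated as just X before the scalar bitmask is built below.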
28271 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28272 bool AllUndef = true;
28273 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
28274 AllUndef &= Op.getOperand(I).isUndef();
28275
28276 if (AllUndef)
28277 Op = Op.getOperand(0);
28278 }
28279
28280 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
28281 if (VectorBits)
28282 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
28283}
28284
28287 SelectionDAG &DAG, EVT ExtendVT,
28288 EVT CastVT) {
28289 SDLoc DL(N);
28290 SDValue Op = N->getOperand(0);
28291 EVT VT = N->getValueType(0);
28292
28293 // Use SCALAR_TO_VECTOR for lane zero
28294 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
28295 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
28296 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
28297 Results.push_back(
28298 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
28299}
28300
28301void AArch64TargetLowering::ReplaceBITCASTResults(
28303 SDLoc DL(N);
28304 SDValue Op = N->getOperand(0);
28305 EVT VT = N->getValueType(0);
28306 EVT SrcVT = Op.getValueType();
28307
28308 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
28309 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
28310 return;
28311 }
28312
28313 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
28314 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
28315 return;
28316 }
28317
28318 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
28319 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
28320 return;
28321 }
28322
28323 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
28324 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
28325 "Expected fp->int bitcast!");
28326
28327 // Bitcasting between unpacked vector types of different element counts is
28328 // not a NOP because the live elements are laid out differently.
28329 // 01234567
28330 // e.g. nxv2i32 = XX??XX??
28331 // nxv4f16 = X?X?X?X?
28332 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
28333 return;
28334
28335 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
28336 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
28337 return;
28338 }
28339
28340 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28341 !VT.isVector())
28342 return replaceBoolVectorBitcast(N, Results, DAG);
28343
28344 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
28345 return;
28346
28347 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
28348 DAG.getUNDEF(MVT::i32), Op);
28349 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
28350 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
28351}
28352
28354 SelectionDAG &DAG,
28355 const AArch64Subtarget *Subtarget) {
28356 EVT VT = N->getValueType(0);
28357 if (!VT.is256BitVector() ||
28359 !N->getFlags().hasAllowReassociation()) ||
28360 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
28361 VT.getScalarType() == MVT::bf16)
28362 return;
28363
28364 SDValue X = N->getOperand(0);
28365 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
28366 if (!Shuf) {
28367 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
28368 X = N->getOperand(1);
28369 if (!Shuf)
28370 return;
28371 }
28372
28373 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
28374 return;
28375
28376 // Check the mask is 1,0,3,2,5,4,...
28377 ArrayRef<int> Mask = Shuf->getMask();
28378 for (int I = 0, E = Mask.size(); I < E; I++)
28379 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
28380 return;
28381
28382 SDLoc DL(N);
28383 auto LoHi = DAG.SplitVector(X, DL);
28384 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
28385 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
28386 LoHi.first, LoHi.second);
28387
28388 // Shuffle the elements back into order.
28389 SmallVector<int> NMask;
28390 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
28391 NMask.push_back(I);
28392 NMask.push_back(I);
28393 }
28394 Results.push_back(
28395 DAG.getVectorShuffle(VT, DL,
28396 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
28397 DAG.getUNDEF(LoHi.first.getValueType())),
28398 DAG.getUNDEF(VT), NMask));
28399}
28400
28403 SelectionDAG &DAG, unsigned InterOp,
28404 unsigned AcrossOp) {
28405 EVT LoVT, HiVT;
28406 SDValue Lo, Hi;
28407 SDLoc DL(N);
28408 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
28409 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28410 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
28411 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
28412 Results.push_back(SplitVal);
28413}
28414
28415void AArch64TargetLowering::ReplaceExtractSubVectorResults(
28417 SDValue In = N->getOperand(0);
28418 EVT InVT = In.getValueType();
28419
28420 // Common code will handle these just fine.
28421 if (!InVT.isScalableVector() || !InVT.isInteger())
28422 return;
28423
28424 SDLoc DL(N);
28425 EVT VT = N->getValueType(0);
28426
28427 // The following checks bail if this is not a halving operation.
28428
28429 ElementCount ResEC = VT.getVectorElementCount();
28430
28431 if (InVT.getVectorElementCount() != (ResEC * 2))
28432 return;
28433
28434 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
28435 if (!CIndex)
28436 return;
28437
28438 unsigned Index = CIndex->getZExtValue();
28439 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
28440 return;
28441
28442 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
28443 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28444
28445 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
28446 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
28447}
28448
28449void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
28451 assert((Subtarget->hasSVE2p1() ||
28452 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
28453 "Custom lower of get.active.lane.mask missing required feature.");
28454
28455 assert(N->getValueType(0) == MVT::nxv32i1 &&
28456 "Unexpected result type for get.active.lane.mask");
28457
28458 SDLoc DL(N);
28459 SDValue Idx = N->getOperand(0);
28460 SDValue TC = N->getOperand(1);
28461
28462 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
28463 "Unexpected operand type for get.active.lane.mask");
28464
28465 if (Idx.getValueType() != MVT::i64) {
28466 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
28467 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
28468 }
28469
28470 SDValue ID =
28471 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
28472 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
28473 auto WideMask =
28474 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
28475
28476 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
28477 {WideMask.getValue(0), WideMask.getValue(1)}));
28478}
28479
28480// Create an even/odd pair of X registers holding integer value V.
28482 SDLoc DL(V.getNode());
28483 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
28484 if (DAG.getDataLayout().isBigEndian())
28485 std::swap (VLo, VHi);
28486 SDValue RegClass =
28487 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
28488 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
28489 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
28490 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
28491 return SDValue(
28492 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
28493}
28494
28497 SelectionDAG &DAG,
28498 const AArch64Subtarget *Subtarget) {
28499 assert(N->getValueType(0) == MVT::i128 &&
28500 "AtomicCmpSwap on types less than 128 should be legal");
28501
28502 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28503 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
28504 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
28505 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
28506 SDValue Ops[] = {
28507 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
28508 createGPRPairNode(DAG, N->getOperand(3)), // Store value
28509 N->getOperand(1), // Ptr
28510 N->getOperand(0), // Chain in
28511 };
28512
28513 unsigned Opcode;
28514 switch (MemOp->getMergedOrdering()) {
28516 Opcode = AArch64::CASPX;
28517 break;
28519 Opcode = AArch64::CASPAX;
28520 break;
28522 Opcode = AArch64::CASPLX;
28523 break;
28526 Opcode = AArch64::CASPALX;
28527 break;
28528 default:
28529 llvm_unreachable("Unexpected ordering!");
28530 }
28531
28532 MachineSDNode *CmpSwap = DAG.getMachineNode(
28533 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
28534 DAG.setNodeMemRefs(CmpSwap, {MemOp});
28535
28536 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
28537 if (DAG.getDataLayout().isBigEndian())
28538 std::swap(SubReg1, SubReg2);
28539 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
28540 SDValue(CmpSwap, 0));
28541 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
28542 SDValue(CmpSwap, 0));
28543 Results.push_back(
28544 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28545 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
28546 return;
28547 }
28548
28549 unsigned Opcode;
28550 switch (MemOp->getMergedOrdering()) {
28552 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
28553 break;
28555 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
28556 break;
28558 Opcode = AArch64::CMP_SWAP_128_RELEASE;
28559 break;
28562 Opcode = AArch64::CMP_SWAP_128;
28563 break;
28564 default:
28565 llvm_unreachable("Unexpected ordering!");
28566 }
28567
28568 SDLoc DL(N);
28569 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
28570 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
28571 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
28572 New.first, New.second, N->getOperand(0)};
28573 SDNode *CmpSwap = DAG.getMachineNode(
28574 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
28575 Ops);
28576 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
28577
28578 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28579 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
28580 Results.push_back(SDValue(CmpSwap, 3));
28581}
28582
28583static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
28584 AtomicOrdering Ordering) {
28585 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
28586 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
28587 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
28588 // ATOMIC_LOAD_CLR at any point.
28589 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
28590 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
28591 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
28592 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
28593
28594 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28595 // The operand will need to be XORed in a separate step.
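    // (LDCLRP atomically computes Mem &= ~Data, so an atomic AND with V is
    // expressed as LDCLRP with ~V; ReplaceATOMIC_LOAD_128Results performs that
    // inversion by XORing the operand with all-ones.)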
28596 switch (Ordering) {
28598 return AArch64::LDCLRP;
28599 break;
28601 return AArch64::LDCLRPA;
28602 break;
28604 return AArch64::LDCLRPL;
28605 break;
28608 return AArch64::LDCLRPAL;
28609 break;
28610 default:
28611 llvm_unreachable("Unexpected ordering!");
28612 }
28613 }
28614
28615 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
28616 switch (Ordering) {
28618 return AArch64::LDSETP;
28619 break;
28621 return AArch64::LDSETPA;
28622 break;
28624 return AArch64::LDSETPL;
28625 break;
28628 return AArch64::LDSETPAL;
28629 break;
28630 default:
28631 llvm_unreachable("Unexpected ordering!");
28632 }
28633 }
28634
28635 if (ISDOpcode == ISD::ATOMIC_SWAP) {
28636 switch (Ordering) {
28638 return AArch64::SWPP;
28639 break;
28641 return AArch64::SWPPA;
28642 break;
28644 return AArch64::SWPPL;
28645 break;
28648 return AArch64::SWPPAL;
28649 break;
28650 default:
28651 llvm_unreachable("Unexpected ordering!");
28652 }
28653 }
28654
28655 llvm_unreachable("Unexpected ISDOpcode!");
28656}
28657
28660 SelectionDAG &DAG,
28661 const AArch64Subtarget *Subtarget) {
28662 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
28663 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28664 // rather than the CASP instructions, because CASP has register classes for
28665 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28666 // to present them as single operands. LSE128 instructions use the GPR64
28667 // register class (because the pair does not have to be sequential), like
28668 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28669
28670 assert(N->getValueType(0) == MVT::i128 &&
28671 "AtomicLoadXXX on types less than 128 should be legal");
28672
28673 if (!Subtarget->hasLSE128())
28674 return;
28675
28676 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28677 const SDValue &Chain = N->getOperand(0);
28678 const SDValue &Ptr = N->getOperand(1);
28679 const SDValue &Val128 = N->getOperand(2);
28680 std::pair<SDValue, SDValue> Val2x64 =
28681 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28682
28683 const unsigned ISDOpcode = N->getOpcode();
28684 const unsigned MachineOpcode =
28685 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28686
28687 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28688 SDLoc DL(Val128);
28689 Val2x64.first =
28690 DAG.getNode(ISD::XOR, DL, MVT::i64,
28691 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28692 Val2x64.second =
28693 DAG.getNode(ISD::XOR, DL, MVT::i64,
28694 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28695 }
28696
28697 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28698 if (DAG.getDataLayout().isBigEndian())
28699 std::swap(Ops[0], Ops[1]);
28700
28701 MachineSDNode *AtomicInst =
28702 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28703 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28704
28705 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28706
28707 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28708 if (DAG.getDataLayout().isBigEndian())
28709 std::swap(Lo, Hi);
28710
28711 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28712 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28713}
28714
28715void AArch64TargetLowering::ReplaceNodeResults(
28717 switch (N->getOpcode()) {
28718 default:
28719 llvm_unreachable("Don't know how to custom expand this");
28720 case ISD::BITCAST:
28721 ReplaceBITCASTResults(N, Results, DAG);
28722 return;
28723 case ISD::VECREDUCE_ADD:
28724 case ISD::VECREDUCE_SMAX:
28725 case ISD::VECREDUCE_SMIN:
28726 case ISD::VECREDUCE_UMAX:
28727 case ISD::VECREDUCE_UMIN:
28728 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28729 return;
28731 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28732 Results.push_back(Res);
28733 return;
28734 case ISD::ADD:
28735 case ISD::FADD:
28736 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28737 return;
28738
28739 case ISD::CTPOP:
28740 case ISD::PARITY:
28741 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28742 Results.push_back(Result);
28743 return;
28744 case AArch64ISD::SADDV:
28745 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28746 return;
28747 case AArch64ISD::UADDV:
28748 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28749 return;
28750 case AArch64ISD::SMINV:
28751 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28752 return;
28753 case AArch64ISD::UMINV:
28754 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28755 return;
28756 case AArch64ISD::SMAXV:
28757 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28758 return;
28759 case AArch64ISD::UMAXV:
28760 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28761 return;
28762 case ISD::MULHS:
28764 Results.push_back(
28765 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28766 return;
28767 case ISD::MULHU:
28769 Results.push_back(
28770 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28771 return;
28772 case ISD::FP_TO_UINT:
28773 case ISD::FP_TO_SINT:
28776 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28777 // Let normal code take care of it by not adding anything to Results.
28778 return;
28779 case ISD::ATOMIC_CMP_SWAP:
28780 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28781 return;
28782 case ISD::ATOMIC_LOAD_CLR:
28783 assert(N->getValueType(0) != MVT::i128 &&
28784 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28785 break;
28786 case ISD::ATOMIC_LOAD_AND:
28787 case ISD::ATOMIC_LOAD_OR:
28788 case ISD::ATOMIC_SWAP: {
28789 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28790 "Expected 128-bit atomicrmw.");
28791 // These need custom type legalisation so we go directly to instruction.
28792 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28793 return;
28794 }
28795 case ISD::ADDRSPACECAST: {
28796 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28797 Results.push_back(V);
28798 return;
28799 }
28800 case ISD::ATOMIC_LOAD:
28801 case ISD::LOAD: {
28802 MemSDNode *LoadNode = cast<MemSDNode>(N);
28803 EVT MemVT = LoadNode->getMemoryVT();
28804 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
28805 // targets.
28806 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28807 MemVT.getSizeInBits() == 256u &&
28808 (MemVT.getScalarSizeInBits() == 8u ||
28809 MemVT.getScalarSizeInBits() == 16u ||
28810 MemVT.getScalarSizeInBits() == 32u ||
28811 MemVT.getScalarSizeInBits() == 64u)) {
28812
28813 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28815 AArch64ISD::LDNP, SDLoc(N),
28816 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28817 {LoadNode->getChain(), LoadNode->getBasePtr()},
28818 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28819
28820 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28821 DAG.getBitcast(HalfVT, Result.getValue(0)),
28822 DAG.getBitcast(HalfVT, Result.getValue(1)));
28823 Results.append({Pair, Result.getValue(2) /* Chain */});
28824 return;
28825 }
28826
28827 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28828 LoadNode->getMemoryVT() != MVT::i128) {
28829 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
28830 // optimizer.
28831 return;
28832 }
28833
28834 if (SDValue(N, 0).getValueType() == MVT::i128) {
28835 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28836 bool isLoadAcquire =
28838 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28839
28840 if (isLoadAcquire)
28841 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28842
28844 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28845 {LoadNode->getChain(), LoadNode->getBasePtr()},
28846 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28847
28848 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28849
28850 SDValue Pair =
28851 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28852 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28853 Results.append({Pair, Result.getValue(2) /* Chain */});
28854 }
28855 return;
28856 }
28858 ReplaceExtractSubVectorResults(N, Results, DAG);
28859 return;
28862 // Custom lowering has been requested for INSERT_SUBVECTOR and
28863 // CONCAT_VECTORS -- but delegate to common code for result type
28864 // legalisation
28865 return;
28866 case ISD::GET_ACTIVE_LANE_MASK:
28867 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28868 return;
28870 EVT VT = N->getValueType(0);
28871
28872 Intrinsic::ID IntID =
28873 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28874 switch (IntID) {
28875 default:
28876 return;
28877 case Intrinsic::aarch64_sve_clasta_n: {
28878 assert((VT == MVT::i8 || VT == MVT::i16) &&
28879 "custom lowering for unexpected type");
28880 SDLoc DL(N);
28881 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28882 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28883 N->getOperand(1), Op2, N->getOperand(3));
28884 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28885 return;
28886 }
28887 case Intrinsic::aarch64_sve_clastb_n: {
28888 assert((VT == MVT::i8 || VT == MVT::i16) &&
28889 "custom lowering for unexpected type");
28890 SDLoc DL(N);
28891 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28892 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28893 N->getOperand(1), Op2, N->getOperand(3));
28894 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28895 return;
28896 }
28897 case Intrinsic::aarch64_sve_lasta: {
28898 assert((VT == MVT::i8 || VT == MVT::i16) &&
28899 "custom lowering for unexpected type");
28900 SDLoc DL(N);
28901 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28902 N->getOperand(1), N->getOperand(2));
28903 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28904 return;
28905 }
28906 case Intrinsic::aarch64_sve_lastb: {
28907 assert((VT == MVT::i8 || VT == MVT::i16) &&
28908 "custom lowering for unexpected type");
28909 SDLoc DL(N);
28910 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28911 N->getOperand(1), N->getOperand(2));
28912 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28913 return;
28914 }
28915 case Intrinsic::aarch64_sme_in_streaming_mode: {
28916 SDLoc DL(N);
28917 SDValue Chain = DAG.getEntryNode();
28918
28919 SDValue RuntimePStateSM =
28920 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28921 Results.push_back(
28922 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28923 return;
28924 }
28925 case Intrinsic::experimental_vector_match: {
28926 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28927 return;
28928
28929 // NOTE: Only trivial type promotion is supported.
28930 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28931 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28932 return;
28933
28934 SDLoc DL(N);
28935 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28936 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28937 return;
28938 }
28939 }
28940 }
28941 case ISD::READ_REGISTER: {
28942 SDLoc DL(N);
28943 assert(N->getValueType(0) == MVT::i128 &&
28944 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28945 SDValue Chain = N->getOperand(0);
28946 SDValue SysRegName = N->getOperand(1);
28947
28948 SDValue Result = DAG.getNode(
28949 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28950 Chain, SysRegName);
28951
28952 // Sysregs are not endian. Result.getValue(0) always contains the lower half
28953 // of the 128-bit System Register value.
28954 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28955 Result.getValue(0), Result.getValue(1));
28956 Results.push_back(Pair);
28957 Results.push_back(Result.getValue(2)); // Chain
28958 return;
28959 }
28960 }
28961}
28962
28964 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28966 return true;
28967}
28968
28970 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28971 // reciprocal if there are three or more FDIVs.
28972 return 3;
28973}
28974
28977 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
28978 // v4i16, v2i32 instead of promoting them.
28979 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28980 VT == MVT::v1f32)
28981 return TypeWidenVector;
28982
28984}
28985
28986// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28987// provided the address is 16-byte aligned.
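// For example (IR-level sketch, names are illustrative): a
//   load atomic i128, ptr %p monotonic, align 16
// or the equivalent 16-byte-aligned 128-bit store satisfies this predicate and
// can be lowered to a single LDP/STP.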
28989 if (!Subtarget->hasLSE2())
28990 return false;
28991
28992 if (auto LI = dyn_cast<LoadInst>(I))
28993 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28994 LI->getAlign() >= Align(16);
28995
28996 if (auto SI = dyn_cast<StoreInst>(I))
28997 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28998 SI->getAlign() >= Align(16);
28999
29000 return false;
29001}
29002
29004 if (!Subtarget->hasLSE128())
29005 return false;
29006
29007 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
29008 // will clobber the two data registers (they are overwritten with the old value).
29009 if (const auto *SI = dyn_cast<StoreInst>(I))
29010 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29011 SI->getAlign() >= Align(16) &&
29012 (SI->getOrdering() == AtomicOrdering::Release ||
29013 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
29014
29015 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
29016 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29017 RMW->getAlign() >= Align(16) &&
29018 (RMW->getOperation() == AtomicRMWInst::Xchg ||
29019 RMW->getOperation() == AtomicRMWInst::And ||
29020 RMW->getOperation() == AtomicRMWInst::Or);
29021
29022 return false;
29023}
29024
29026 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
29027 return false;
29028
29029 if (auto LI = dyn_cast<LoadInst>(I))
29030 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29031 LI->getAlign() >= Align(16) &&
29032 LI->getOrdering() == AtomicOrdering::Acquire;
29033
29034 if (auto SI = dyn_cast<StoreInst>(I))
29035 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29036 SI->getAlign() >= Align(16) &&
29037 SI->getOrdering() == AtomicOrdering::Release;
29038
29039 return false;
29040}
29041
29043 const Instruction *I) const {
29045 return false;
29047 return false;
29049 return true;
29050 return false;
29051}
29052
29054 const Instruction *I) const {
29055 // Store-Release instructions only provide seq_cst guarantees when paired with
29056 // Load-Acquire instructions. MSVC CRT does not use these instructions to
29057 // implement seq_cst loads and stores, so we need additional explicit fences
29058 // after memory writes.
29059 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
29060 return false;
29061
29062 switch (I->getOpcode()) {
29063 default:
29064 return false;
29065 case Instruction::AtomicCmpXchg:
29066 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
29068 case Instruction::AtomicRMW:
29069 return cast<AtomicRMWInst>(I)->getOrdering() ==
29071 case Instruction::Store:
29072 return cast<StoreInst>(I)->getOrdering() ==
29074 }
29075}
29076
29077// Loads and stores less than 128 bits are already atomic; ones above that
29078// are doomed anyway, so defer to the default libcall and blame the OS when
29079// things go wrong.
29082 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
29083 if (Size != 128)
29092}
29093
29094// Loads and stores less than 128 bits are already atomic; ones above that
29095// are doomed anyway, so defer to the default libcall and blame the OS when
29096// things go wrong.
29099 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
29100
29101 if (Size != 128)
29103 if (isOpSuitableForRCPC3(LI))
29105 // No LSE128 loads
29106 if (isOpSuitableForLDPSTP(LI))
29108
29109 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29110 // implement atomicrmw without spilling. If the target address is also on the
29111 // stack and close enough to the spill slot, this can lead to a situation
29112 // where the monitor always gets cleared and the atomic operation can never
29113 // succeed. So at -O0 lower this operation to a CAS loop.
29114 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29116
29117 // Using CAS for an atomic load has a better chance of succeeding under high
29118 // contention situations. So use it if available.
29119 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
29121}
29122
29123// Return true if the atomic operation expansion will lower to use a library
29124// call, and is thus ineligible to use an LLSC expansion.
29125static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
29126 const AtomicRMWInst *RMW) {
29127 if (!RMW->isFloatingPointOperation())
29128 return false;
29129 switch (RMW->getType()->getScalarType()->getTypeID()) {
29130 case Type::FloatTyID:
29131 case Type::DoubleTyID:
29132 case Type::HalfTyID:
29133 case Type::BFloatTyID:
29134 // Will use soft float
29135 return !Subtarget.hasFPARMv8();
29136 default:
29137 // fp128 will emit library calls.
29138 return true;
29139 }
29140
29141 llvm_unreachable("covered type switch");
29142}
29143
29144// The "default" for integer RMW operations is to expand to an LL/SC loop.
29145// However, with the LSE instructions (or outline-atomics mode, which provides
29146// library routines in place of the LSE-instructions), we can directly emit many
29147// operations instead.
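// For example (illustrative): with LSE, an
//   atomicrmw add ptr %p, i64 %v seq_cst
// can be selected as a single LDADDAL instead of an LDAXR/STLXR retry loop.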
29150 Type *Ty = AI->getType();
29151 unsigned Size = Ty->getPrimitiveSizeInBits();
29152 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
29153
29154 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
29158 if (CanUseLSE128)
29160
29161 // If LSFE is available, use atomic FP instructions in preference to expansion.
29162 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
29168
29169 // Nand is not supported in LSE.
29170 // Leave 128 bits to LLSC or CmpXChg.
29171 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
29172 !AI->isFloatingPointOperation()) {
29173 if (Subtarget->hasLSE())
29175 if (Subtarget->outlineAtomics()) {
29176 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
29177 // Don't outline them unless
29178 // (1) high level <atomic> support approved:
29179 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
29180 // (2) low level libgcc and compiler-rt support implemented by:
29181 // min/max outline atomics helpers
29182 if (AI->getOperation() != AtomicRMWInst::Min &&
29187 }
29188 }
29189 }
29190
29191 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29192 // implement atomicrmw without spilling. If the target address is also on the
29193 // stack and close enough to the spill slot, this can lead to a situation
29194 // where the monitor always gets cleared and the atomic operation can never
29195 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
29196 // we have a single CAS instruction that can replace the loop.
29197 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
29198 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
29200
29202}
29203
29206 AtomicCmpXchgInst *AI) const {
29207 // If subtarget has LSE, leave cmpxchg intact for codegen.
29208 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
29210 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29211 // implement cmpxchg without spilling. If the address being exchanged is also
29212 // on the stack and close enough to the spill slot, this can lead to a
29213 // situation where the monitor always gets cleared and the atomic operation
29214 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
29215 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29217
29218 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
29219 // it.
29221 if (Size > 64)
29223
29225}
29226
29228 Type *ValueTy, Value *Addr,
29229 AtomicOrdering Ord) const {
29230 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29231 bool IsAcquire = isAcquireOrStronger(Ord);
29232
29233 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
29234 // intrinsics must return {i64, i64} and we have to recombine them into a
29235 // single i128 here.
29236 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29238 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29239
29240 Value *LoHi =
29241 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29242
29243 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29244 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29245
29246 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29247 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29248 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29249
29250 Value *Or = Builder.CreateOr(
29251 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
29252 return Builder.CreateBitCast(Or, ValueTy);
29253 }
29254
29255 Type *Tys[] = { Addr->getType() };
29257 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
29258
29259 const DataLayout &DL = M->getDataLayout();
29260 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
29261 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
29262 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
29263 Attribute::ElementType, IntEltTy));
29264 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
29265
29266 return Builder.CreateBitCast(Trunc, ValueTy);
29267}
29268
29270 IRBuilderBase &Builder) const {
29271 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
29272}
29273
29275 Value *Val, Value *Addr,
29276 AtomicOrdering Ord) const {
29277 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29278 bool IsRelease = isReleaseOrStronger(Ord);
29279
29280 // Since the intrinsics must have legal type, the i128 intrinsics take two
29281 // parameters: "i64, i64". We must marshal Val into the appropriate form
29282 // before the call.
29283 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
29285 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
29287 Type *Int64Ty = Type::getInt64Ty(M->getContext());
29288 Type *Int128Ty = Type::getInt128Ty(M->getContext());
29289
29290 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
29291
29292 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
29293 Value *Hi =
29294 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
29295 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
29296 }
29297
29299 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
29300 Type *Tys[] = { Addr->getType() };
29302
29303 const DataLayout &DL = M->getDataLayout();
29304 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
29305 Val = Builder.CreateBitCast(Val, IntValTy);
29306
29307 CallInst *CI = Builder.CreateCall(
29308 Stxr, {Builder.CreateZExtOrBitCast(
29309 Val, Stxr->getFunctionType()->getParamType(0)),
29310 Addr});
29311 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
29312 Attribute::ElementType, Val->getType()));
29313 return CI;
29314}
29315
29317 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
29318 const DataLayout &DL) const {
29319 if (!Ty->isArrayTy()) {
29320 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
29321 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
29322 }
29323
29324 // All non-aggregate members of the type must have the same type.
29325 SmallVector<EVT> ValueVTs;
29326 ComputeValueVTs(*this, DL, Ty, ValueVTs);
29327 return all_equal(ValueVTs);
29328}
29329
29330bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
29331 EVT) const {
29332 return false;
29333}
29334
29335static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
29336 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
29337 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
29338 M, Intrinsic::thread_pointer, IRB.getPtrTy());
29339 return IRB.CreatePointerCast(
29340 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
29341 Offset),
29342 IRB.getPtrTy(0));
29343}
29344
29346 // Android provides a fixed TLS slot for the stack cookie. See the definition
29347 // of TLS_SLOT_STACK_GUARD in
29348 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
29349 if (Subtarget->isTargetAndroid())
29350 return UseTlsOffset(IRB, 0x28);
29351
29352 // Fuchsia is similar.
29353 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
29354 if (Subtarget->isTargetFuchsia())
29355 return UseTlsOffset(IRB, -0x10);
29356
29358}
29359
29361 // MSVC CRT provides functionalities for stack protection.
29362 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29363 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29364
29365 RTLIB::LibcallImpl SecurityCookieVar =
29366 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
29367 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
29368 SecurityCookieVar != RTLIB::Unsupported) {
29369 // MSVC CRT has a global variable holding security cookie.
29370 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
29371 PointerType::getUnqual(M.getContext()));
29372
29373 // MSVC CRT has a function to validate security cookie.
29374 FunctionCallee SecurityCheckCookie =
29375 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
29376 Type::getVoidTy(M.getContext()),
29377 PointerType::getUnqual(M.getContext()));
29378 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
29379 F->setCallingConv(CallingConv::Win64);
29380 F->addParamAttr(0, Attribute::AttrKind::InReg);
29381 }
29382 return;
29383 }
29385}
29386
29388 // MSVC CRT has a function to validate security cookie.
29389 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29390 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29391 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
29392 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
29394}
29395
29396Value *
29398 // Android provides a fixed TLS slot for the SafeStack pointer. See the
29399 // definition of TLS_SLOT_SAFESTACK in
29400 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
29401 if (Subtarget->isTargetAndroid())
29402 return UseTlsOffset(IRB, 0x48);
29403
29404 // Fuchsia is similar.
29405 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
29406 if (Subtarget->isTargetFuchsia())
29407 return UseTlsOffset(IRB, -0x8);
29408
29410}
29411
29412/// If a physical register, this returns the register that receives the
29413/// exception address on entry to an EH pad.
29415 const Constant *PersonalityFn) const {
29416 // FIXME: This is a guess. Has this been defined yet?
29417 return AArch64::X0;
29418}
29419
29420/// If a physical register, this returns the register that receives the
29421/// exception typeid on entry to a landing pad.
29423 const Constant *PersonalityFn) const {
29424 // FIXME: This is a guess. Has this been defined yet?
29425 return AArch64::X1;
29426}
29427
29429 const Instruction &AndI) const {
29430 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
29431 // this likely allows folding the and/cmp/br into a single tbz instruction. It
29432 // may be beneficial to sink in other cases, but we would have to check that
29433 // the cmp would not get folded into the br to form a cbz for these to be
29434 // beneficial.
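  // For example (illustrative), IR of the form
  //   %m = and i64 %x, 4
  //   %c = icmp eq i64 %m, 0
  //   br i1 %c, ...
  // can become a single "tbz x?, #2, ..." once the mask is sunk next to the
  // compare.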
29436 if (!Mask)
29437 return false;
29438 return Mask->getValue().isPowerOf2();
29439}
29440
29444 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
29445 SelectionDAG &DAG) const {
29446 // Does baseline recommend not to perform the fold by default?
29448 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
29449 return false;
29450 // Else, if this is a vector shift, prefer 'shl'.
29451 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
29452}
29453
29456 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
29458 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
29461 ExpansionFactor);
29462}
29463
29465 // Update IsSplitCSR in AArch64FunctionInfo.
29466 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
29467 AFI->setIsSplitCSR(true);
29468}
29469
29471 MachineBasicBlock *Entry,
29472 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
29473 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
29474 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
29475 if (!IStart)
29476 return;
29477
29478 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
29479 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
29480 MachineBasicBlock::iterator MBBI = Entry->begin();
29481 for (const MCPhysReg *I = IStart; *I; ++I) {
29482 const TargetRegisterClass *RC = nullptr;
29483 if (AArch64::GPR64RegClass.contains(*I))
29484 RC = &AArch64::GPR64RegClass;
29485 else if (AArch64::FPR64RegClass.contains(*I))
29486 RC = &AArch64::FPR64RegClass;
29487 else
29488 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
29489
29490 Register NewVR = MRI->createVirtualRegister(RC);
29491 // Create copy from CSR to a virtual register.
29492 // FIXME: this currently does not emit CFI pseudo-instructions, it works
29493 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
29494 // nounwind. If we want to generalize this later, we may need to emit
29495 // CFI pseudo-instructions.
29496 assert(Entry->getParent()->getFunction().hasFnAttribute(
29497 Attribute::NoUnwind) &&
29498 "Function should be nounwind in insertCopiesSplitCSR!");
29499 Entry->addLiveIn(*I);
29500 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
29501 .addReg(*I);
29502
29503 // Insert the copy-back instructions right before the terminator.
29504 for (auto *Exit : Exits)
29505 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
29506 TII->get(TargetOpcode::COPY), *I)
29507 .addReg(NewVR);
29508 }
29509}
29510
29511bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
29512 // Integer division on AArch64 is expensive. However, when aggressively
29513 // optimizing for code size, we prefer to use a div instruction, as it is
29514 // usually smaller than the alternative sequence.
29515 // The exception to this is vector division. Since AArch64 doesn't have vector
29516 // integer division, leaving the division as-is is a loss even in terms of
29517 // size, because it will have to be scalarized, while the alternative code
29518 // sequence can be performed in vector form.
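  // For example, under minsize a scalar "sdiv i32" stays a single sdiv
  // instruction, whereas a "sdiv <4 x i32>" has no direct instruction and
  // would be scalarized into four divides; the multiply-based expansion,
  // which stays in vector form, is the smaller option there.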
29519 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
29520 return OptSize && !VT.isVector();
29521}
29522
29524 const MachineFunction &MF) const {
29525 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
29526 // In the future, we could allow this when SVE is available, but currently,
29527 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
29528 // the general lowering may introduce stack spills/reloads).
29529 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
29530 return false;
29531
29532 // Do not merge to a float value size (128 bits) if the NoImplicitFloat
29533 // attribute is set.
29534 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
29535 return !NoFloat || MemVT.getSizeInBits() <= 64;
29536}
29537
29539 // We want inc-of-add for scalars and sub-of-not for vectors.
29540 return VT.isScalarInteger();
29541}
29542
29544 EVT VT) const {
29545 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
29546 // legalize.
29547 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
29548 return false;
29549 if (FPVT == MVT::v8bf16)
29550 return false;
29551 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
29552}
29553
29555 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
29556 // avoid vselect becoming bsl / unrolling.
29557 return !VT.isFixedLengthVector();
29558}
29559
29563 const TargetInstrInfo *TII) const {
29564 assert(MBBI->isCall() && MBBI->getCFIType() &&
29565 "Invalid call instruction for a KCFI check");
29566
29567 switch (MBBI->getOpcode()) {
29568 case AArch64::BLR:
29569 case AArch64::BLRNoIP:
29570 case AArch64::TCRETURNri:
29571 case AArch64::TCRETURNrix16x17:
29572 case AArch64::TCRETURNrix17:
29573 case AArch64::TCRETURNrinotx16:
29574 break;
29575 default:
29576 llvm_unreachable("Unexpected CFI call opcode");
29577 }
29578
29579 MachineOperand &Target = MBBI->getOperand(0);
29580 assert(Target.isReg() && "Invalid target operand for an indirect call");
29581 Target.setIsRenamable(false);
29582
29583 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
29584 .addReg(Target.getReg())
29585 .addImm(MBBI->getCFIType())
29586 .getInstr();
29587}
29588
29590 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
29591}
29592
29593unsigned
29595 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
29596 return getPointerTy(DL).getSizeInBits();
29597
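  // The generic AAPCS64 va_list is a struct of three pointers (__stack,
  // __gr_top, __vr_top) plus two 32-bit offsets (__gr_offs, __vr_offs),
  // e.g. 3 * 64 + 2 * 32 = 256 bits with 64-bit pointers; Darwin and
  // Windows use a simple pointer-sized va_list instead (handled above).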
29598 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
29599}
29600
29601void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
29602 MachineFrameInfo &MFI = MF.getFrameInfo();
29603 // If we have any vulnerable SVE stack objects then the stack protector
29604 // needs to be placed at the top of the SVE stack area, as the SVE locals
29605 // are placed above the other locals, so we allocate it as if it were a
29606 // scalable vector.
29607 // FIXME: It may be worthwhile having a specific interface for this rather
29608 // than doing it here in finalizeLowering.
29609 if (MFI.hasStackProtectorIndex()) {
29610 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
29611 if (MFI.hasScalableStackID(i) &&
29616 break;
29617 }
29618 }
29619 }
29622}
29623
29624// Unlike X86, we let frame lowering assign offsets to all catch objects.
29626
29627bool AArch64TargetLowering::shouldLocalize(
29628 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
29629 auto &MF = *MI.getMF();
29630 auto &MRI = MF.getRegInfo();
29631 auto maxUses = [](unsigned RematCost) {
29632 // A cost of 1 means remats are basically free.
29633 if (RematCost == 1)
29634 return std::numeric_limits<unsigned>::max();
29635 if (RematCost == 2)
29636 return 2U;
29637
29638 // Remat is too expensive, only sink if there's one user.
29639 if (RematCost > 2)
29640 return 1U;
29641 llvm_unreachable("Unexpected remat cost");
29642 };
29643
29644 unsigned Opc = MI.getOpcode();
29645 switch (Opc) {
29646 case TargetOpcode::G_GLOBAL_VALUE: {
29647 // On Darwin, TLS global vars get selected into function calls, which
29648    // we don't want localized, as they can get moved into the middle of
29649    // another call sequence.
29650 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29651 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29652 return false;
29653 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29654 }
29655 case TargetOpcode::G_FCONSTANT:
29656 case TargetOpcode::G_CONSTANT: {
29657 const ConstantInt *CI;
29658 unsigned AdditionalCost = 0;
29659
29660 if (Opc == TargetOpcode::G_CONSTANT)
29661 CI = MI.getOperand(1).getCImm();
29662 else {
29663 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29664 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29665 // materialized as integers.
29666 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29667 break;
29668 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29669 bool OptForSize = MF.getFunction().hasOptSize();
29671 OptForSize))
29672 return true; // Constant should be cheap.
29673 CI =
29674 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29675 // FP materialization also costs an extra move, from gpr to fpr.
29676 AdditionalCost = 1;
29677 }
29678 APInt Imm = CI->getValue();
29681 assert(Cost.isValid() && "Expected a valid imm cost");
29682
29683 unsigned RematCost = Cost.getValue();
29684 RematCost += AdditionalCost;
29685 Register Reg = MI.getOperand(0).getReg();
29686 unsigned MaxUses = maxUses(RematCost);
29687 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29688 if (MaxUses == std::numeric_limits<unsigned>::max())
29689 --MaxUses;
29690 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29691 }
29692 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29693 // localizable.
29694 case AArch64::ADRP:
29695 case AArch64::G_ADD_LOW:
29696 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29697 case TargetOpcode::G_PTR_ADD:
29698 return true;
29699 default:
29700 break;
29701 }
29703}
29704
29706 // Fallback for scalable vectors.
29707 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29708 // all instructions, regardless of whether they are actually supported.
29709 if (!EnableSVEGISel) {
29710 if (Inst.getType()->isScalableTy()) {
29711 return true;
29712 }
29713
29714 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29715 if (Inst.getOperand(i)->getType()->isScalableTy())
29716 return true;
29717
29718 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29719 if (AI->getAllocatedType()->isScalableTy())
29720 return true;
29721 }
29722 }
29723
29724 // Checks to allow the use of SME instructions
29725 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29726 auto CallAttrs = SMECallAttrs(*Base, this);
29727 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29728 CallAttrs.requiresPreservingZT0() ||
29729 CallAttrs.requiresPreservingAllZAState())
29730 return true;
29731 }
29732 return false;
29733}
29734
29735// Return the largest legal scalable vector type that matches VT's element type.
29739 "Expected legal fixed length vector!");
29740 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29741 default:
29742 llvm_unreachable("unexpected element type for SVE container");
29743 case MVT::i8:
29744 return EVT(MVT::nxv16i8);
29745 case MVT::i16:
29746 return EVT(MVT::nxv8i16);
29747 case MVT::i32:
29748 return EVT(MVT::nxv4i32);
29749 case MVT::i64:
29750 return EVT(MVT::nxv2i64);
29751 case MVT::bf16:
29752 return EVT(MVT::nxv8bf16);
29753 case MVT::f16:
29754 return EVT(MVT::nxv8f16);
29755 case MVT::f32:
29756 return EVT(MVT::nxv4f32);
29757 case MVT::f64:
29758 return EVT(MVT::nxv2f64);
29759 }
29760}
29761
29762// Return a predicate with active lanes corresponding to the extent of VT.
29764 EVT VT) {
29767 "Expected legal fixed length vector!");
29768
29769 std::optional<unsigned> PgPattern =
29771 assert(PgPattern && "Unexpected element count for SVE predicate");
29772
29773 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29774 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29775 // variants of instructions when available.
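  // For example (illustrative), with -msve-vector-bits=256 a v8i32 mask
  // (exactly 256 bits) can use ptrue with the "all" pattern, whereas a
  // v4i32 mask uses a ptrue with the VL4 pattern derived from its element
  // count.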
29776 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29777 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29778 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29779 if (MaxSVESize && MinSVESize == MaxSVESize &&
29780 MaxSVESize == VT.getSizeInBits())
29781 PgPattern = AArch64SVEPredPattern::all;
29782
29783 MVT MaskVT;
29784 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29785 default:
29786 llvm_unreachable("unexpected element type for SVE predicate");
29787 case MVT::i8:
29788 MaskVT = MVT::nxv16i1;
29789 break;
29790 case MVT::i16:
29791 case MVT::f16:
29792 case MVT::bf16:
29793 MaskVT = MVT::nxv8i1;
29794 break;
29795 case MVT::i32:
29796 case MVT::f32:
29797 MaskVT = MVT::nxv4i1;
29798 break;
29799 case MVT::i64:
29800 case MVT::f64:
29801 MaskVT = MVT::nxv2i1;
29802 break;
29803 }
29804
29805 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29806}
29807
29809 EVT VT) {
29811 "Expected legal scalable vector!");
29812 auto PredTy = VT.changeVectorElementType(MVT::i1);
29813 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29814}
29815
29817 if (VT.isFixedLengthVector())
29818 return getPredicateForFixedLengthVector(DAG, DL, VT);
29819
29820 return getPredicateForScalableVector(DAG, DL, VT);
29821}
29822
29823// Grow V to consume an entire SVE register.
29825 assert(VT.isScalableVector() &&
29826 "Expected to convert into a scalable vector!");
29827 assert(V.getValueType().isFixedLengthVector() &&
29828 "Expected a fixed length vector operand!");
29829 SDLoc DL(V);
29830 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29831 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29832}
29833
29834// Shrink V so it's just big enough to maintain a VT's worth of data.
29837 "Expected to convert into a fixed length vector!");
29838 assert(V.getValueType().isScalableVector() &&
29839 "Expected a scalable vector operand!");
29840 SDLoc DL(V);
29841 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29842 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29843}
29844
29845// Convert all fixed length vector loads larger than NEON to masked_loads.
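// For example (sketch), with 256-bit SVE a v8f32 load becomes a predicated
// masked load of nxv4i32 (FP vectors are loaded as integers and bitcast
// back), governed by a predicate sized to the original eight lanes.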
29846SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29847 SDValue Op, SelectionDAG &DAG) const {
29848 auto Load = cast<LoadSDNode>(Op);
29849
29850 SDLoc DL(Op);
29851 EVT VT = Op.getValueType();
29852 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29853 EVT LoadVT = ContainerVT;
29854 EVT MemVT = Load->getMemoryVT();
29855
29856 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29857
29858 if (VT.isFloatingPoint()) {
29859 LoadVT = ContainerVT.changeTypeToInteger();
29860 MemVT = MemVT.changeTypeToInteger();
29861 }
29862
29863 SDValue NewLoad = DAG.getMaskedLoad(
29864 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29865 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29866 Load->getAddressingMode(), Load->getExtensionType());
29867
29868 SDValue Result = NewLoad;
29869 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29870 EVT ExtendVT = ContainerVT.changeVectorElementType(
29871 Load->getMemoryVT().getVectorElementType());
29872
29873 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29874 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29875 Pg, Result, DAG.getUNDEF(ContainerVT));
29876 } else if (VT.isFloatingPoint()) {
29877 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29878 }
29879
29880 Result = convertFromScalableVector(DAG, VT, Result);
29881 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29882 return DAG.getMergeValues(MergedValues, DL);
29883}
29884
29886 SelectionDAG &DAG) {
29887 SDLoc DL(Mask);
29888 EVT InVT = Mask.getValueType();
29889 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29891
29892 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29893 return Pg;
29894
29895 bool InvertCond = false;
29896 if (isBitwiseNot(Mask)) {
29897 InvertCond = true;
29898 Mask = Mask.getOperand(0);
29899 }
29900
29901 SDValue Op1, Op2;
29902 ISD::CondCode CC;
29903
29904 // When Mask is the result of a SETCC, it's better to regenerate the compare.
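  // Regenerating the SETCC directly produces a predicate (via the
  // SETCC_MERGE_ZERO below) instead of materialising the boolean vector and
  // then comparing it against zero.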
29905 if (Mask.getOpcode() == ISD::SETCC) {
29906 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29907 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29908 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29909 } else {
29910 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29911 Op2 = DAG.getConstant(0, DL, ContainerVT);
29912 CC = ISD::SETNE;
29913 }
29914
29915 if (InvertCond)
29916 CC = getSetCCInverse(CC, Op1.getValueType());
29917
29918 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29919 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29920}
29921
29922// Convert all fixed length vector masked loads larger than NEON to SVE
29923// masked_loads.
29923SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29924 SDValue Op, SelectionDAG &DAG) const {
29926
29927 SDLoc DL(Op);
29928 EVT VT = Op.getValueType();
29929 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29930
29931 SDValue Mask = Load->getMask();
29932  // If this is an extending load and the mask type is not the same as the
29933  // load's type, then we have to extend the mask type.
29934 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29935 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29936 "Incorrect mask type");
29937 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29938 }
29940
29941 SDValue PassThru;
29942 bool IsPassThruZeroOrUndef = false;
29943
29944 if (Load->getPassThru()->isUndef()) {
29945 PassThru = DAG.getUNDEF(ContainerVT);
29946 IsPassThruZeroOrUndef = true;
29947 } else {
29948 if (ContainerVT.isInteger())
29949 PassThru = DAG.getConstant(0, DL, ContainerVT);
29950 else
29951 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29952 if (isZerosVector(Load->getPassThru().getNode()))
29953 IsPassThruZeroOrUndef = true;
29954 }
29955
29956 SDValue NewLoad = DAG.getMaskedLoad(
29957 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29958 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29959 Load->getAddressingMode(), Load->getExtensionType());
29960
29961 SDValue Result = NewLoad;
29962 if (!IsPassThruZeroOrUndef) {
29963 SDValue OldPassThru =
29964 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29965 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29966 }
29967
29968 Result = convertFromScalableVector(DAG, VT, Result);
29969 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29970 return DAG.getMergeValues(MergedValues, DL);
29971}
29972
29973// Convert all fixed length vector stores larger than NEON to masked_stores.
29974SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29975 SDValue Op, SelectionDAG &DAG) const {
29976 auto Store = cast<StoreSDNode>(Op);
29977
29978 SDLoc DL(Op);
29979 EVT VT = Store->getValue().getValueType();
29980 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29981 EVT MemVT = Store->getMemoryVT();
29982
29983 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29984 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29985
29986 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29987 EVT TruncVT = ContainerVT.changeVectorElementType(
29988 Store->getMemoryVT().getVectorElementType());
29989 MemVT = MemVT.changeTypeToInteger();
29990 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29991 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29992 DAG.getUNDEF(TruncVT));
29993 NewValue =
29994 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29995 } else if (VT.isFloatingPoint()) {
29996 MemVT = MemVT.changeTypeToInteger();
29997 NewValue =
29998 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29999 }
30000
30001 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
30002 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
30003 Store->getMemOperand(), Store->getAddressingMode(),
30004 Store->isTruncatingStore());
30005}
30006
30007SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
30008 SDValue Op, SelectionDAG &DAG) const {
30010
30011 SDLoc DL(Op);
30012 EVT VT = Store->getValue().getValueType();
30013 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30014
30015 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30017
30018 return DAG.getMaskedStore(
30019 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
30020 Mask, Store->getMemoryVT(), Store->getMemOperand(),
30021 Store->getAddressingMode(), Store->isTruncatingStore());
30022}
30023
30024SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
30025 SDValue Op, SelectionDAG &DAG) const {
30026 SDLoc DL(Op);
30027 EVT VT = Op.getValueType();
30028 EVT EltVT = VT.getVectorElementType();
30029
30030 bool Signed = Op.getOpcode() == ISD::SDIV;
30031 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
30032
30033 bool Negated;
30034 uint64_t SplatVal;
30035 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
30036 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30037 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30038 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
30039
30041 SDValue Res =
30042 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
30043 if (Negated)
30044 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
30045 DAG.getConstant(0, DL, ContainerVT), Res);
30046
30047 return convertFromScalableVector(DAG, VT, Res);
30048 }
30049
30050 // Scalable vector i32/i64 DIV is supported.
30051 if (EltVT == MVT::i32 || EltVT == MVT::i64)
30052 return LowerToPredicatedOp(Op, DAG, PredOpcode);
30053
30054 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
30055 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
30056 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
30057 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30058
30059 // If the wider type is legal: extend, op, and truncate.
30060 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
30061 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
30062 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
30063 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
30064 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
30065 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
30066 }
30067
30068 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
30069 &ExtendOpcode](SDValue Op) {
30070 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
30071 SDValue IdxHalf =
30072 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
30073 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
30074 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
30075 return std::pair<SDValue, SDValue>(
30076 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
30077 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
30078 };
30079
30080 // If wider type is not legal: split, extend, op, trunc and concat.
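  // For example (sketch), a v16i8 divide whose widened v16i16 form is not
  // legal is split into two v8i8 halves; each half is extended to v8i16,
  // divided, truncated back to v8i8, and the halves are concatenated. The
  // resulting v8i16 divides are then legalized in turn.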
30081 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
30082 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
30083 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
30084 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
30085 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
30086 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
30087 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
30088}
30089
30090SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
30091 SDValue Op, SelectionDAG &DAG) const {
30092 EVT VT = Op.getValueType();
30093 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30094
30095 SDLoc DL(Op);
30096 SDValue Val = Op.getOperand(0);
30097 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30098 Val = convertToScalableVector(DAG, ContainerVT, Val);
30099
30100 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
30101 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
30102
30103 // Repeatedly unpack Val until the result is of the desired element type.
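  // For example, sign-extending from i8 elements to i64 elements applies
  // SUNPKLO three times: nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64.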
30104 switch (ContainerVT.getSimpleVT().SimpleTy) {
30105 default:
30106 llvm_unreachable("unimplemented container type");
30107 case MVT::nxv16i8:
30108 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
30109 if (VT.getVectorElementType() == MVT::i16)
30110 break;
30111 [[fallthrough]];
30112 case MVT::nxv8i16:
30113 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
30114 if (VT.getVectorElementType() == MVT::i32)
30115 break;
30116 [[fallthrough]];
30117 case MVT::nxv4i32:
30118 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
30119 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
30120 break;
30121 }
30122
30123 return convertFromScalableVector(DAG, VT, Val);
30124}
30125
30126SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
30127 SDValue Op, SelectionDAG &DAG) const {
30128 EVT VT = Op.getValueType();
30129 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30130
30131 SDLoc DL(Op);
30132 SDValue Val = Op.getOperand(0);
30133 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30134 Val = convertToScalableVector(DAG, ContainerVT, Val);
30135
30136 // Repeatedly truncate Val until the result is of the desired element type.
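  // For example, truncating from i64 elements to i8 elements applies a
  // bitcast followed by UZP1 with itself (keeping the low half of each wide
  // element) three times: nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8.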
30137 switch (ContainerVT.getSimpleVT().SimpleTy) {
30138 default:
30139 llvm_unreachable("unimplemented container type");
30140 case MVT::nxv2i64:
30141 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
30142 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
30143 if (VT.getVectorElementType() == MVT::i32)
30144 break;
30145 [[fallthrough]];
30146 case MVT::nxv4i32:
30147 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
30148 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
30149 if (VT.getVectorElementType() == MVT::i16)
30150 break;
30151 [[fallthrough]];
30152 case MVT::nxv8i16:
30153 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
30154 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
30155 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
30156 break;
30157 }
30158
30159 return convertFromScalableVector(DAG, VT, Val);
30160}
30161
30162SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
30163 SDValue Op, SelectionDAG &DAG) const {
30164 EVT VT = Op.getValueType();
30165 EVT InVT = Op.getOperand(0).getValueType();
30166 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
30167
30168 SDLoc DL(Op);
30169 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30170 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30171
30172 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
30173}
30174
30175SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
30176 SDValue Op, SelectionDAG &DAG) const {
30177 EVT VT = Op.getValueType();
30178 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30179
30180 SDLoc DL(Op);
30181 EVT InVT = Op.getOperand(0).getValueType();
30182 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30183 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30184
30185 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
30186 Op.getOperand(1), Op.getOperand(2));
30187
30188 return convertFromScalableVector(DAG, VT, ScalableRes);
30189}
30190
30191// Convert vector operation 'Op' to an equivalent predicated operation whereby
30192// the original operation's type is used to construct a suitable predicate.
30193// NOTE: The results for inactive lanes are undefined.
30194SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
30195 SelectionDAG &DAG,
30196 unsigned NewOp) const {
30197 EVT VT = Op.getValueType();
30198 SDLoc DL(Op);
30199 auto Pg = getPredicateForVector(DAG, DL, VT);
30200
30201 if (VT.isFixedLengthVector()) {
30202 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
30203 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30204
30205 // Create list of operands by converting existing ones to scalable types.
30207 for (const SDValue &V : Op->op_values()) {
30208 if (isa<CondCodeSDNode>(V)) {
30209 Operands.push_back(V);
30210 continue;
30211 }
30212
30213 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
30214 EVT VTArg = VTNode->getVT().getVectorElementType();
30215 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
30216 Operands.push_back(DAG.getValueType(NewVTArg));
30217 continue;
30218 }
30219
30220 assert(isTypeLegal(V.getValueType()) &&
30221 "Expected only legal fixed-width types");
30222 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
30223 }
30224
30225 if (isMergePassthruOpcode(NewOp))
30226 Operands.push_back(DAG.getUNDEF(ContainerVT));
30227
30228 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30229 return convertFromScalableVector(DAG, VT, ScalableRes);
30230 }
30231
30232 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
30233
30235 for (const SDValue &V : Op->op_values()) {
30236 assert((!V.getValueType().isVector() ||
30237 V.getValueType().isScalableVector()) &&
30238 "Only scalable vectors are supported!");
30239 Operands.push_back(V);
30240 }
30241
30242 if (isMergePassthruOpcode(NewOp))
30243 Operands.push_back(DAG.getUNDEF(VT));
30244
30245 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
30246}
30247
30248// If a fixed length vector operation has no side effects when applied to
30249// undefined elements, we can safely use scalable vectors to perform the same
30250// operation without needing to worry about predication.
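// For example (sketch), a v4i32 add is performed as an unpredicated nxv4i32
// add; lanes beyond the original four are undefined, which is harmless here.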
30251SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
30252 SelectionDAG &DAG) const {
30253 EVT VT = Op.getValueType();
30255 "Only expected to lower fixed length vector operation!");
30256 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30257
30258 // Create list of operands by converting existing ones to scalable types.
30260 for (const SDValue &V : Op->op_values()) {
30261 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
30262
30263 // Pass through non-vector operands.
30264 if (!V.getValueType().isVector()) {
30265 Ops.push_back(V);
30266 continue;
30267 }
30268
30269 // "cast" fixed length vector to a scalable vector.
30270 assert(V.getValueType().isFixedLengthVector() &&
30271 isTypeLegal(V.getValueType()) &&
30272 "Only fixed length vectors are supported!");
30273 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
30274 }
30275
30276 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
30277 return convertFromScalableVector(DAG, VT, ScalableRes);
30278}
30279
30280SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
30281 SelectionDAG &DAG) const {
30282 SDLoc DL(ScalarOp);
30283 SDValue AccOp = ScalarOp.getOperand(0);
30284 SDValue VecOp = ScalarOp.getOperand(1);
30285 EVT SrcVT = VecOp.getValueType();
30286 EVT ResVT = SrcVT.getVectorElementType();
30287
30288 EVT ContainerVT = SrcVT;
30289 if (SrcVT.isFixedLengthVector()) {
30290 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30291 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30292 }
30293
30294 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30295 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30296
30297 // Convert operands to Scalable.
30298 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
30299 DAG.getUNDEF(ContainerVT), AccOp, Zero);
30300
30301 // Perform reduction.
30302 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
30303 Pg, AccOp, VecOp);
30304
30305 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
30306}
30307
30308SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
30309 SelectionDAG &DAG) const {
30310 SDLoc DL(ReduceOp);
30311 SDValue Op = ReduceOp.getOperand(0);
30312 EVT OpVT = Op.getValueType();
30313 EVT VT = ReduceOp.getValueType();
30314
30315 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
30316 return SDValue();
30317
30318 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
30319
30320 switch (ReduceOp.getOpcode()) {
30321 default:
30322 return SDValue();
30323 case ISD::VECREDUCE_OR:
30324 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
30325 // The predicate can be 'Op' because
30326 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
30327 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
30328 else
30329 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
30330 case ISD::VECREDUCE_AND: {
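    // All active lanes of Op are true exactly when no active lane of
    // (Op ^ Pg) is true, so the AND reduction becomes a PTEST for
    // NONE_ACTIVE on the inverted value.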
30331 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
30332 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
30333 }
30334 case ISD::VECREDUCE_XOR: {
30335 SDValue ID =
30336 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
30337 if (OpVT == MVT::nxv1i1) {
30338 // Emulate a CNTP on .Q using .D and a different governing predicate.
30339 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
30340 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
30341 }
30342 SDValue Cntp =
30343 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
30344 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
30345 }
30346 }
30347
30348 return SDValue();
30349}
30350
30351SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
30352 SDValue ScalarOp,
30353 SelectionDAG &DAG) const {
30354 SDLoc DL(ScalarOp);
30355 SDValue VecOp = ScalarOp.getOperand(0);
30356 EVT SrcVT = VecOp.getValueType();
30357
30359 SrcVT,
30360 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
30361 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30362 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30363 }
30364
30365 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
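  // For example, vecreduce_add(zext <vscale x 16 x i1> %p to <vscale x 16 x i8>)
  // is simply the number of set predicate lanes, which CNTP computes directly.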
30366 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
30367 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
30368 SDValue BoolVec = VecOp.getOperand(0);
30369 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
30370 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
30371 SDValue CntpOp = DAG.getNode(
30372 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
30373 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
30374 BoolVec, BoolVec);
30375 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
30376 }
30377 }
30378
30379 // UADDV always returns an i64 result.
30380 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
30381 SrcVT.getVectorElementType();
30382 EVT RdxVT = SrcVT;
30383 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
30384 RdxVT = getPackedSVEVectorVT(ResVT);
30385
30386 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30387 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
30388 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
30389 Rdx, DAG.getConstant(0, DL, MVT::i64));
30390
30391 // The VEC_REDUCE nodes expect an element size result.
30392 if (ResVT != ScalarOp.getValueType())
30393 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
30394
30395 return Res;
30396}
30397
30398SDValue
30399AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
30400 SelectionDAG &DAG) const {
30401 EVT VT = Op.getValueType();
30402 SDLoc DL(Op);
30403
30404 EVT InVT = Op.getOperand(1).getValueType();
30405 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30406 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
30407 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
30408
30409  // Convert the mask to a predicate (NOTE: We don't need to worry about
30410 // inactive lanes since VSELECT is safe when given undefined elements).
30411 EVT MaskVT = Op.getOperand(0).getValueType();
30412 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
30413 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
30415 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
30416
30417 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
30418 Mask, Op1, Op2);
30419
30420 return convertFromScalableVector(DAG, VT, ScalableRes);
30421}
30422
30423SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
30424 SDValue Op, SelectionDAG &DAG) const {
30425 SDLoc DL(Op);
30426 EVT InVT = Op.getOperand(0).getValueType();
30427 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30428
30429 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
30430 "Only expected to lower fixed length vector operation!");
30431 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
30432 "Expected integer result of the same bit length as the inputs!");
30433
30434 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30435 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
30436 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
30437
30438 EVT CmpVT = Pg.getValueType();
30439 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
30440 {Pg, Op1, Op2, Op.getOperand(2)});
30441
30442 EVT PromoteVT = ContainerVT.changeTypeToInteger();
30443 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
30444 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
30445}
30446
30447SDValue
30448AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
30449 SelectionDAG &DAG) const {
30450 SDLoc DL(Op);
30451 auto SrcOp = Op.getOperand(0);
30452 EVT VT = Op.getValueType();
30453 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30454 EVT ContainerSrcVT =
30456
30457 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
30458 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
30459 return convertFromScalableVector(DAG, VT, Op);
30460}
30461
30462SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
30463 SDValue Op, SelectionDAG &DAG) const {
30464 SDLoc DL(Op);
30465 unsigned NumOperands = Op->getNumOperands();
30466
30467 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
30468 "Unexpected number of operands in CONCAT_VECTORS");
30469
30470 auto SrcOp1 = Op.getOperand(0);
30471 auto SrcOp2 = Op.getOperand(1);
30472 EVT VT = Op.getValueType();
30473 EVT SrcVT = SrcOp1.getValueType();
30474
30475 // Match a splat of 128b segments that fit in a single register.
30476 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
30477 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30478 SDValue Splat =
30479 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
30480 convertToScalableVector(DAG, ContainerVT, SrcOp1),
30481 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
30482 return convertFromScalableVector(DAG, VT, Splat);
30483 }
30484
30485 if (NumOperands > 2) {
30487 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
30488 for (unsigned I = 0; I < NumOperands; I += 2)
30489 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
30490 Op->getOperand(I), Op->getOperand(I + 1)));
30491
30492 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
30493 }
30494
30495 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30496
30498 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
30499 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
30500
30501 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
30502
30503 return convertFromScalableVector(DAG, VT, Op);
30504}
30505
30506SDValue
30507AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
30508 SelectionDAG &DAG) const {
30509 EVT VT = Op.getValueType();
30510 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30511
30512 SDLoc DL(Op);
30513 SDValue Val = Op.getOperand(0);
30514 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30515 EVT SrcVT = Val.getValueType();
30516 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30517 EVT ExtendVT = ContainerVT.changeVectorElementType(
30518 SrcVT.getVectorElementType());
30519
30520 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30521 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
30522
30523 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
30524 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
30525 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30526 Pg, Val, DAG.getUNDEF(ContainerVT));
30527
30528 return convertFromScalableVector(DAG, VT, Val);
30529}
30530
30531SDValue
30532AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
30533 SelectionDAG &DAG) const {
30534 EVT VT = Op.getValueType();
30535 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30536
30537 SDLoc DL(Op);
30538 SDValue Val = Op.getOperand(0);
30539 EVT SrcVT = Val.getValueType();
30540 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30541 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
30543 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
30544
30545 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30546 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
30547 Op.getOperand(1), DAG.getUNDEF(RoundVT));
30548 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
30549 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30550
30551 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30552 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30553}
30554
30555SDValue
30556AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
30557 SelectionDAG &DAG) const {
30558 EVT VT = Op.getValueType();
30559 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30560
30561 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
30562 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
30563 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
30564
30565 SDLoc DL(Op);
30566 SDValue Val = Op.getOperand(0);
30567 EVT SrcVT = Val.getValueType();
30568 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30569 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30570
30571 if (VT.bitsGE(SrcVT)) {
30573
30574 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
30575 VT.changeTypeToInteger(), Val);
30576
30577 // Safe to use a larger than specified operand because by promoting the
30578 // value nothing has changed from an arithmetic point of view.
30579 Val =
30580 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
30581 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30582 DAG.getUNDEF(ContainerDstVT));
30583 return convertFromScalableVector(DAG, VT, Val);
30584 } else {
30585 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
30586 ContainerDstVT.getVectorElementType());
30588
30589 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30590 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30591 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
30592 Val = convertFromScalableVector(DAG, SrcVT, Val);
30593
30594 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30595 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30596 }
30597}
30598
30599SDValue
30600AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
30601 SelectionDAG &DAG) const {
30602 SDLoc DL(Op);
30603 EVT OpVT = Op.getValueType();
30604 assert(OpVT.isScalableVector() &&
30605 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
30606
30607 // Are multi-register uzp instructions available?
30608 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30609 OpVT.getVectorElementType() != MVT::i1) {
30610 Intrinsic::ID IntID;
30611 switch (Op->getNumOperands()) {
30612 default:
30613 return SDValue();
30614 case 2:
30615 IntID = Intrinsic::aarch64_sve_uzp_x2;
30616 break;
30617 case 4:
30618 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30619 OpVT.getScalarSizeInBits() == 64)
30620 return SDValue();
30621 IntID = Intrinsic::aarch64_sve_uzp_x4;
30622 break;
30623 }
30624
30626 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30627 Ops.append(Op->op_values().begin(), Op->op_values().end());
30628 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30629 }
30630
30631 if (Op->getNumOperands() != 2)
30632 return SDValue();
30633
30634 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
30635 Op.getOperand(1));
30636 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
30637 Op.getOperand(1));
30638 return DAG.getMergeValues({Even, Odd}, DL);
30639}
30640
30641SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
30642 SelectionDAG &DAG) const {
30643 SDLoc DL(Op);
30644 EVT OpVT = Op.getValueType();
30645 assert(OpVT.isScalableVector() &&
30646 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30647
30648 // Are multi-register zip instructions available?
30649 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30650 OpVT.getVectorElementType() != MVT::i1) {
30651 Intrinsic::ID IntID;
30652 switch (Op->getNumOperands()) {
30653 default:
30654 return SDValue();
30655 case 2:
30656 IntID = Intrinsic::aarch64_sve_zip_x2;
30657 break;
30658 case 4:
30659 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30660 OpVT.getScalarSizeInBits() == 64)
30661 return SDValue();
30662 IntID = Intrinsic::aarch64_sve_zip_x4;
30663 break;
30664 }
30665
30667 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30668 Ops.append(Op->op_values().begin(), Op->op_values().end());
30669 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30670 }
30671
30672 if (Op->getNumOperands() != 2)
30673 return SDValue();
30674
30675 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30676 Op.getOperand(1));
30677 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30678 Op.getOperand(1));
30679 return DAG.getMergeValues({Lo, Hi}, DL);
30680}
30681
30682SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30683 SelectionDAG &DAG) const {
30684 // FIXME: Maybe share some code with LowerMGather/Scatter?
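  // Overview (sketch): gather the current bucket values, use sve.histcnt to
  // get a per-lane count of matching indices, scale that count by Inc, add
  // it to the gathered values, and scatter the result back.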
30685 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30686 SDLoc DL(HG);
30687 SDValue Chain = HG->getChain();
30688 SDValue Inc = HG->getInc();
30689 SDValue Mask = HG->getMask();
30690 SDValue Ptr = HG->getBasePtr();
30691 SDValue Index = HG->getIndex();
30692 SDValue Scale = HG->getScale();
30693 SDValue IntID = HG->getIntID();
30694
30695 // The Intrinsic ID determines the type of update operation.
30696 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30697 // Right now, we only support 'add' as an update.
30698 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30699 "Unexpected histogram update operation");
30700
30701 EVT IndexVT = Index.getValueType();
30702 LLVMContext &Ctx = *DAG.getContext();
30703 ElementCount EC = IndexVT.getVectorElementCount();
30704 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30705 EVT IncExtVT =
30706 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30707 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30708 bool ExtTrunc = IncSplatVT != MemVT;
30709
30710 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30711 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30712 SDValue IncSplat = DAG.getSplatVector(
30713 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30714 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30715
30716 MachineMemOperand *MMO = HG->getMemOperand();
30717 // Create an MMO for the gather, without load|store flags.
30718 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30720 MMO->getAlign(), MMO->getAAInfo());
30721 ISD::MemIndexType IndexType = HG->getIndexType();
30722 SDValue Gather = DAG.getMaskedGather(
30723 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30724 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30725
30726 SDValue GChain = Gather.getValue(1);
30727
30728 // Perform the histcnt, multiply by inc, add to bucket data.
30729 SDValue ID =
30730 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30731 SDValue HistCnt =
30732 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30733 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30734 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30735
30736 // Create an MMO for the scatter, without load|store flags.
30737 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30739 MMO->getAlign(), MMO->getAAInfo());
30740
30741 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30742 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30743 ScatterOps, SMMO, IndexType, ExtTrunc);
30744 return Scatter;
30745}
30746
30747/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30748/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30749/// however still make use of the dot product instruction by instead
30750/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30751/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30752/// the following pattern is emitted:
30753/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))), ext(EXTRACT_SUBVECTOR(N,
30754/// NTy/2)))
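/// For example (sketch), an nxv2i64 accumulator with nxv16i8 inputs first
/// accumulates a dot product into a zeroed nxv4i32, then widens that into
/// the accumulator either with (U|S)ADDWB/(U|S)ADDWT when SVE2 is available
/// or by extending the two halves and adding them.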
30755SDValue
30756AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30757 SelectionDAG &DAG) const {
30758 SDLoc DL(Op);
30759
30760 SDValue Acc = Op.getOperand(0);
30761 SDValue LHS = Op.getOperand(1);
30762 SDValue RHS = Op.getOperand(2);
30763 EVT ResultVT = Op.getValueType();
30764 EVT OrigResultVT = ResultVT;
30765 EVT OpVT = LHS.getValueType();
30766
30767 bool ConvertToScalable =
30768 ResultVT.isFixedLengthVector() &&
30769 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30770
30771 if (ConvertToScalable) {
30772 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30773 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30774 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30775 LHS = convertToScalableVector(DAG, OpVT, LHS);
30776 RHS = convertToScalableVector(DAG, OpVT, RHS);
30777 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30778 }
30779
30780 // Two-way and four-way partial reductions are supported by patterns.
30781 // We only need to handle the 8-way partial reduction.
30782 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30783 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30784 : Op;
30785
30786 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30787 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30788 DAG.getConstant(0, DL, DotVT), LHS, RHS);
30789
30790 SDValue Res;
30791 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30792 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30793 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30794 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30795 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30796 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30797 } else {
30798 // Fold (nx)v4i32 into (nx)v2i64
30799 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30800 if (IsUnsigned) {
30801 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30802 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30803 } else {
30804 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30805 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30806 }
30807 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30808 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30809 }
30810
30811 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30812 : Res;
30813}
30814
30815SDValue
30816AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30817 SelectionDAG &DAG) const {
30818 EVT VT = Op.getValueType();
30819 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30820
30821 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30822 "Lowering fixed length get_active_lane_mask requires SVE!");
30823
30824 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30825 // but we can use SVE when available.
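  // For example (sketch), a v8i16 result is computed as an nxv8i1
  // get_active_lane_mask, sign-extended to nxv8i16, with the low v8i16
  // extracted from it.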
30826
30827 SDLoc DL(Op);
30828 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30829 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30830
30831 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30832 Op.getOperand(0), Op.getOperand(1));
30833 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30834 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30835 DAG.getVectorIdxConstant(0, DL));
30836}
30837
30838SDValue
30839AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30840 SelectionDAG &DAG) const {
30841 EVT VT = Op.getValueType();
30842 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30843
30844 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30845 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30846 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30847
30848 SDLoc DL(Op);
30849 SDValue Val = Op.getOperand(0);
30850 EVT SrcVT = Val.getValueType();
30851 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30852 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30853
30854 if (VT.bitsGT(SrcVT)) {
30855 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30856 ContainerSrcVT.getVectorElementType());
30858
30859 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30860 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30861
30862 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30863 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30864 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30865 DAG.getUNDEF(ContainerDstVT));
30866 return convertFromScalableVector(DAG, VT, Val);
30867 } else {
30868 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30869 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
30870
30871 // Safe to use a larger than specified result since an fp_to_int where the
30872 // result doesn't fit into the destination is undefined.
30873 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30874 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30875 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30876
30877 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30878 }
30879}
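// The two branches above differ in where the conversion runs relative to the
// result width. Illustrative examples (fixed-length types assume SVE VLS):
//  * v2f32 -> v2i64: any-extend into the wide nxv2i64 container, reinterpret
//    the input as unpacked nxv2f32, then FCVTZS/FCVTZU at the wide type.
//  * v4f64 -> v4i32: convert at the wide nxv2i64 type and truncate afterwards,
//    which is safe because out-of-range fp_to_int results are undefined.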
30880
30881static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
30882 ArrayRef<int> ShuffleMask, EVT VT,
30883 EVT ContainerVT, SelectionDAG &DAG) {
30884 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30885 SDLoc DL(Op);
30886 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30887 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30888 bool IsSingleOp =
30889 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30890
30891 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30892 MinSVESize = 128;
30893
30894 // Bail out on two-operand shuffles if SVE2 is unavailable or not all
30895 // index values can be represented.
30896 if (!IsSingleOp && !Subtarget.hasSVE2())
30897 return SDValue();
30898
30899 EVT VTOp1 = Op.getOperand(0).getValueType();
30900 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30901 unsigned IndexLen = MinSVESize / BitsPerElt;
30902 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30903 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30904 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30905 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30906 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30907 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30908 "Incorrectly legalised shuffle operation");
30909
30910 SmallVector<SDValue, 8> TBLMask;
30911 // If MinSVESize is not equal to MaxSVESize then we need to know which
30912 // TBL mask element needs adjustment.
30913 SmallVector<SDValue, 8> AddRuntimeVLMask;
30914
30915 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30916 // size 8 bits is only sufficient to index into the first source vector.
30917 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30918 return SDValue();
30919
30920 for (int Index : ShuffleMask) {
30921 // Handling poison index value.
30922 if (Index < 0)
30923 Index = 0;
30924 // If the mask refers to elements in the second operand, then we have to
30925 // offset the index by the number of elements in a vector. If this number
30926 // is not known at compile-time, we need to maintain a mask with 'VL' values
30927 // to add at runtime.
30928 if ((unsigned)Index >= ElementsPerVectorReg) {
30929 if (MinMaxEqual) {
30930 Index += IndexLen - ElementsPerVectorReg;
30931 } else {
30932 Index = Index - ElementsPerVectorReg;
30933 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30934 }
30935 } else if (!MinMaxEqual)
30936 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30937 // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
30938 // 255, this might point to the last element in the second operand
30939 // of the shufflevector, so we reject this transform.
30940 if ((unsigned)Index >= MaxOffset)
30941 return SDValue();
30942 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30943 }
30944
30945 // Choosing an out-of-range index leads to the lane being zeroed, whereas a
30946 // zero index value would instead duplicate the first lane for out-of-range
30947 // elements. For i8 elements an out-of-range index might still be a valid
30948 // index for a 2048-bit vector register size.
30949 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30950 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30951 if (!MinMaxEqual)
30952 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30953 }
30954
30955 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30956 SDValue VecMask =
30957 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30958 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30959
30960 SDValue Shuffle;
30961 if (IsSingleOp)
30962 Shuffle =
30963 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30964 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30965 Op1, SVEMask);
30966 else if (Subtarget.hasSVE2()) {
30967 if (!MinMaxEqual) {
30968 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30969 SDValue VScale = (BitsPerElt == 64)
30970 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30971 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30972 SDValue VecMask =
30973 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30974 SDValue MulByMask = DAG.getNode(
30975 ISD::MUL, DL, MaskType,
30976 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30977 DAG.getBuildVector(MaskType, DL,
30978 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30979 SDValue UpdatedVecMask =
30980 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30981 SVEMask = convertToScalableVector(
30982 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30983 }
30984 Shuffle =
30985 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30986 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30987 Op1, Op2, SVEMask);
30988 }
30989 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30990 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30991}
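// Worked example of the mask construction above (illustrative values): for a
// two-operand v4i32 shuffle <0, 4, 1, 5> with MinSVESize == MaxSVESize == 256,
// IndexLen is 8 and ElementsPerVectorReg is 4, so indices into the second
// operand are rebased by IndexLen - ElementsPerVectorReg = 4, giving a TBL2
// mask of <0, 8, 1, 9> padded with MaxOffset so the trailing lanes read as
// out of range and are zeroed.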
30992
30993SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30994 SDValue Op, SelectionDAG &DAG) const {
30995 EVT VT = Op.getValueType();
30996 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30997
30998 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
30999 auto ShuffleMask = SVN->getMask();
31000
31001 SDLoc DL(Op);
31002 SDValue Op1 = Op.getOperand(0);
31003 SDValue Op2 = Op.getOperand(1);
31004
31005 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31006 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
31007 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
31008
31009 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
31010 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
31011 return MVT::i32;
31012 return ScalarTy;
31013 };
31014
31015 if (SVN->isSplat()) {
31016 unsigned Lane = std::max(0, SVN->getSplatIndex());
31017 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31018 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31019 DAG.getConstant(Lane, DL, MVT::i64));
31020 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
31021 return convertFromScalableVector(DAG, VT, Op);
31022 }
31023
31024 bool ReverseEXT = false;
31025 unsigned Imm;
31026 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
31027 Imm == VT.getVectorNumElements() - 1) {
31028 if (ReverseEXT)
31029 std::swap(Op1, Op2);
31030 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31031 SDValue Scalar = DAG.getNode(
31032 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31033 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
31034 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
31035 return convertFromScalableVector(DAG, VT, Op);
31036 }
31037
31038 unsigned EltSize = VT.getScalarSizeInBits();
31039 for (unsigned BlockSize : {64U, 32U, 16U}) {
31040 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
31041 unsigned RevOp;
31042 if (EltSize == 8)
31043 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
31044 else if (EltSize == 16)
31045 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
31046 else
31047 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
31048 EVT BlockedVT =
31049 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
31050 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
31051 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
31052 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
31053 DAG.getUNDEF(BlockedVT));
31054 SDValue Container =
31055 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
31056 return convertFromScalableVector(DAG, VT, Container);
31057 }
31058 }
31059
31060 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
31061 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
31062 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31063 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
31064 Pg, Op1, DAG.getUNDEF(ContainerVT));
31065 return convertFromScalableVector(DAG, VT, Revd);
31066 }
31067
31068 unsigned WhichResult;
31069 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31070 WhichResult == 0)
31071 return convertFromScalableVector(
31072 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
31073
31074 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31075 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31076 return convertFromScalableVector(
31077 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31078 }
31079
31080 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
31081 return convertFromScalableVector(
31082 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
31083
31084 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31085 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31086 return convertFromScalableVector(
31087 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31088 }
31089
31090 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
31091 // represents the same logical operation as performed by a ZIP instruction. In
31092 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
31093 // equivalent to an AArch64 instruction. There's the extra component of
31094 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
31095 // only operated on 64/128-bit vector types that have a direct mapping to a
31096 // target register and so an exact mapping is implied.
31097 // However, when using SVE for fixed length vectors, most legal vector types
31098 // are actually sub-vectors of a larger SVE register. When mapping
31099 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
31100 // how the mask's indices translate. Specifically, when the mapping requires
31101 // an exact meaning for a specific vector index (e.g. Index X is the last
31102 // vector element in the register) then such mappings are often only safe when
31103 // the exact SVE register size is known. The main exception to this is when
31104 // indices are logically relative to the first element of either
31105 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
31106 // when converting from fixed-length to scalable vector types (i.e. the start
31107 // of a fixed length vector is always the start of a scalable vector).
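// For example, ZIP1 only reads the low halves of its source registers, so the
// leading elements of its result are the same for any vector length and it can
// be used unconditionally, whereas ZIP2 reads the high halves, whose contents
// depend on the actual register size. This is why the VL-sensitive patterns
// below are only used when MinSVESize == MaxSVESize == VT.getSizeInBits().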
31108 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
31109 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
31110 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
31111 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
31112 Op2.isUndef()) {
31113 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
31114 return convertFromScalableVector(DAG, VT, Op);
31115 }
31116
31117 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31118 WhichResult != 0)
31119 return convertFromScalableVector(
31120 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
31121
31122 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31123 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31124 return convertFromScalableVector(
31125 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31126 }
31127
31128 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
31129 return convertFromScalableVector(
31130 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
31131
31132 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31133 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31134 return convertFromScalableVector(
31135 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31136 }
31137
31138 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
31139 Subtarget->isSVEorStreamingSVEAvailable()) {
31141 "Unsupported SVE vector size");
31142
31143 unsigned Segments = VT.getSizeInBits() / 128;
31144 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
31145 if (std::optional<unsigned> Lane =
31146 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
31147 SDValue IID =
31148 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
31149 return convertFromScalableVector(
31150 DAG, VT,
31151 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31152 {IID, Op1,
31153 DAG.getConstant(*Lane, DL, MVT::i64,
31154 /*isTarget=*/true)}));
31155 }
31156 }
31157 }
31158
31159 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
31160 // This may allow the shuffle to be matched as something cheaper like ZIP1.
31161 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
31162 return WideOp;
31163
31164 // Avoid producing a TBL instruction if we don't know the minimum SVE register
31165 // size, unless NEON is not available and we can assume the minimum SVE
31166 // register size is 128 bits.
31167 if (MinSVESize || !Subtarget->isNeonAvailable())
31168 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
31169 DAG);
31170
31171 return SDValue();
31172}
31173
31174SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
31175 SelectionDAG &DAG) const {
31176 SDLoc DL(Op);
31177 EVT InVT = Op.getValueType();
31178
31179 assert(VT.isScalableVector() && isTypeLegal(VT) &&
31180 InVT.isScalableVector() && isTypeLegal(InVT) &&
31181 "Only expect to cast between legal scalable vector types!");
31182 assert(VT.getVectorElementType() != MVT::i1 &&
31183 InVT.getVectorElementType() != MVT::i1 &&
31184 "For predicate bitcasts, use getSVEPredicateBitCast");
31185
31186 if (InVT == VT)
31187 return Op;
31188
31189 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
31190 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
31191
31192 // Safe bitcasting between unpacked vector types of different element counts
31193 // is currently unsupported because the following is missing the necessary
31194 // work to ensure the result's elements live where they're supposed to within
31195 // an SVE register.
31196 // 01234567
31197 // e.g. nxv2i32 = XX??XX??
31198 // nxv4f16 = X?X?X?X?
31199 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
31200 VT == PackedVT || InVT == PackedInVT) &&
31201 "Unexpected bitcast!");
31202
31203 // Pack input if required.
31204 if (InVT != PackedInVT)
31205 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
31206
31207 if (Subtarget->isLittleEndian() ||
31208 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
31209 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31210 else {
31211 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
31212 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
31213
31214 // Simulate the effect of casting through memory.
31215 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
31216 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
31217 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
31218 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
31219 if (PackedVTAsInt.getScalarSizeInBits() != 8)
31220 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
31221 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31222 }
31223
31224 // Unpack result if required.
31225 if (VT != PackedVT)
31226 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
31227
31228 return Op;
31229}
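// Example of the big-endian path above (a sketch): bitcasting nxv2f64 to
// nxv8i16 cannot be a single BITCAST because SVE registers are laid out
// lane-wise, so the value is byte-swapped to memory order at the source
// element size, retyped with NVCAST, and byte-swapped back at the destination
// element size, matching what a store/load round trip through memory yields.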
31230
31231bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
31232 SDValue N) const {
31233 return ::isAllActivePredicate(DAG, N);
31234}
31235
31236EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
31237 return ::getPromotedVTForPredicate(VT);
31238}
31239
31240bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
31241 SDValue Op, const APInt &OriginalDemandedBits,
31242 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
31243 unsigned Depth) const {
31244
31245 unsigned Opc = Op.getOpcode();
31246 switch (Opc) {
31247 case AArch64ISD::VSHL: {
31248 // Match (VSHL (VLSHR Val X) X)
31249 SDValue ShiftL = Op;
31250 SDValue ShiftR = Op->getOperand(0);
31251 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
31252 return false;
31253
31254 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
31255 return false;
31256
31257 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
31258 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
31259
31260 // Other cases can be handled as well, but this is not
31261 // implemented.
31262 if (ShiftRBits != ShiftLBits)
31263 return false;
31264
31265 unsigned ScalarSize = Op.getScalarValueSizeInBits();
31266 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
31267
31268 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
31269 APInt UnusedBits = ~OriginalDemandedBits;
31270
31271 if ((ZeroBits & UnusedBits) != ZeroBits)
31272 return false;
31273
31274 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
31275 // used - simplify to just Val.
31276 return TLO.CombineTo(Op, ShiftR->getOperand(0));
31277 }
31278 case AArch64ISD::BICi: {
31279 // Fold BICi if all destination bits already known to be zeroed
31280 SDValue Op0 = Op.getOperand(0);
31281 KnownBits KnownOp0 =
31282 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
31283 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
31284 APInt BitsToClear =
31285 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
31286 .trunc(KnownOp0.getBitWidth());
31287 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
31288 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
31289 return TLO.CombineTo(Op, Op0);
31290
31291 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
31292 return false;
31293 }
31294 case ISD::INTRINSIC_WO_CHAIN: {
31295 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
31296 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31297 if (!MaxSVEVectorSizeInBits)
31298 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31299 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
31300 // The SVE count intrinsics don't support the multiplier immediate so we
31301 // don't have to account for that here. The value returned may be slightly
31302 // over the true required bits, as this is based on the "ALL" pattern. The
31303 // other patterns are also exposed by these intrinsics, but they all
31304 // return a value that's strictly less than "ALL".
31305 unsigned RequiredBits = llvm::bit_width(MaxElements);
31306 unsigned BitWidth = Known.Zero.getBitWidth();
31307 if (RequiredBits < BitWidth)
31308 Known.Zero.setHighBits(BitWidth - RequiredBits);
31309 return false;
31310 }
31311 }
31312 }
31313
31314 return TargetLowering::SimplifyDemandedBitsForTargetNode(
31315 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
31316}
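// Example for the VSHL case above (illustrative): with i32 lanes,
// (VSHL (VLSHR x, 8), 8) clears the low 8 bits of each lane. If the caller
// does not demand those low 8 bits, the shift pair contributes nothing and is
// simplified to x via TLO.CombineTo.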
31317
31318bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
31319 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
31320 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
31321
31322 // TODO: Add more target nodes.
31323 switch (Op.getOpcode()) {
31324 case AArch64ISD::MOVI:
31325 case AArch64ISD::MOVIedit:
31326 case AArch64ISD::MOVImsl:
31327 case AArch64ISD::MOVIshift:
31328 case AArch64ISD::MVNImsl:
31329 case AArch64ISD::MVNIshift:
31330 case AArch64ISD::VASHR:
31331 case AArch64ISD::VLSHR:
31332 case AArch64ISD::VSHL:
31333 return false;
31334 }
31335 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
31336 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
31337}
31338
31339bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
31340 return Op.getOpcode() == AArch64ISD::DUP ||
31341 Op.getOpcode() == AArch64ISD::MOVI ||
31342 Op.getOpcode() == AArch64ISD::MOVIshift ||
31343 Op.getOpcode() == AArch64ISD::MOVImsl ||
31344 Op.getOpcode() == AArch64ISD::MOVIedit ||
31345 Op.getOpcode() == AArch64ISD::MVNIshift ||
31346 Op.getOpcode() == AArch64ISD::MVNImsl ||
31347 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
31348 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
31349 // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
31350 (Op.getOpcode() == ISD::FNEG &&
31351 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
31352 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
31353 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
31354 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
31355 TargetLowering::isTargetCanonicalConstantNode(Op);
31356}
31357
31358bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
31359 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
31360 Subtarget->hasComplxNum();
31361}
31362
31363bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
31364 ComplexDeinterleavingOperation Operation, Type *Ty) const {
31365 auto *VTy = dyn_cast<VectorType>(Ty);
31366 if (!VTy)
31367 return false;
31368
31369 // If the vector is scalable, SVE is enabled, implying support for complex
31370 // numbers. Otherwise, we need to ensure complex number support is available
31371 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
31372 return false;
31373
31374 auto *ScalarTy = VTy->getScalarType();
31375 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
31376
31377 // We can only process vectors that have a bit size of 128 or higher (NEON
31378 // additionally supports 64-bit vectors). These vectors must also have a
31379 // power-of-2 size, as we later split them into the smallest supported size
31380 // and merge them back together after applying the complex operation.
31381 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
31382 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
31383 !llvm::isPowerOf2_32(VTyWidth))
31384 return false;
31385
31386 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
31387 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
31388
31389 if (Operation == ComplexDeinterleavingOperation::CDot)
31390 return ScalarWidth == 32 || ScalarWidth == 64;
31391 return 8 <= ScalarWidth && ScalarWidth <= 64;
31392 }
31393
31394 // CDot is not supported outside of scalable/SVE scopes
31395 if (Operation == ComplexDeinterleavingOperation::CDot)
31396 return false;
31397
31398 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
31399 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
31400}
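// For the check above: for example, nxv4i32 qualifies with SVE2, and
// v4f32 / nxv4f32 qualify when complex-number (FCMA) instructions are
// available, while fixed-length integer vectors and vectors narrower than
// 64 bits are rejected.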
31401
31402Value *AArch64TargetLowering::createComplexDeinterleavingIR(
31403 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
31404 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
31405 Value *Accumulator) const {
31406 VectorType *Ty = cast<VectorType>(InputA->getType());
31407 if (Accumulator == nullptr)
31408 Accumulator = Constant::getNullValue(Ty);
31409 bool IsScalable = Ty->isScalableTy();
31410 bool IsInt = Ty->getElementType()->isIntegerTy();
31411
31412 unsigned TyWidth =
31413 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
31414
31415 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
31416 "Vector type must be either 64 or a power of 2 that is at least 128");
31417
31418 if (TyWidth > 128) {
31419 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
31420 int AccStride = cast<VectorType>(Accumulator->getType())
31421 ->getElementCount()
31422 .getKnownMinValue() /
31423 2;
31424 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
31425 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
31426 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
31427 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
31428 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
31429 Value *LowerSplitAcc = nullptr;
31430 Value *UpperSplitAcc = nullptr;
31431 Type *FullTy = Ty;
31432 FullTy = Accumulator->getType();
31433 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
31434 cast<VectorType>(Accumulator->getType()));
31435 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
31436 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
31437 auto *LowerSplitInt = createComplexDeinterleavingIR(
31438 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
31439 auto *UpperSplitInt = createComplexDeinterleavingIR(
31440 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
31441
31442 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
31443 LowerSplitInt, uint64_t(0));
31444 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
31445 }
31446
31447 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
31448 if (IsScalable) {
31449 if (IsInt)
31450 return B.CreateIntrinsic(
31451 Intrinsic::aarch64_sve_cmla_x, Ty,
31452 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31453
31454 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31455 return B.CreateIntrinsic(
31456 Intrinsic::aarch64_sve_fcmla, Ty,
31457 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31458 }
31459
31460 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
31461 Intrinsic::aarch64_neon_vcmla_rot90,
31462 Intrinsic::aarch64_neon_vcmla_rot180,
31463 Intrinsic::aarch64_neon_vcmla_rot270};
31464
31465
31466 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
31467 {Accumulator, InputA, InputB});
31468 }
31469
31470 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
31471 if (IsScalable) {
31472 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
31473 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
31474 if (IsInt)
31475 return B.CreateIntrinsic(
31476 Intrinsic::aarch64_sve_cadd_x, Ty,
31477 {InputA, InputB, B.getInt32((int)Rotation * 90)});
31478
31479 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31480 return B.CreateIntrinsic(
31481 Intrinsic::aarch64_sve_fcadd, Ty,
31482 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
31483 }
31484 return nullptr;
31485 }
31486
31487 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
31488 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
31489 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
31490 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
31491 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
31492
31493 if (IntId == Intrinsic::not_intrinsic)
31494 return nullptr;
31495
31496 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
31497 }
31498
31499 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
31500 IsScalable) {
31501 return B.CreateIntrinsic(
31502 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
31503 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31504 }
31505
31506 return nullptr;
31507}
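// Sketch of the splitting above (illustrative types): a partial complex
// multiply over nxv8f32 (known-minimum width 256 bits) is split into two
// nxv4f32 halves; each half becomes an sve.fcmla (or sve.cmla for integers)
// with the rotation encoded as 0/90/180/270 degrees, and the two results are
// re-inserted into the full-width accumulator type at offsets 0 and AccStride.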
31508
31509bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
31510 unsigned Opc = N->getOpcode();
31511 if (ISD::isExtOpcode(Opc)) {
31512 if (any_of(N->users(),
31513 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
31514 return false;
31515 }
31516 return true;
31517}
31518
31519unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
31520 return Subtarget->getMinimumJumpTableEntries();
31521}
31522
31523MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
31524 CallingConv::ID CC,
31525 EVT VT) const {
31526 bool NonUnitFixedLengthVector =
31527 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
31528 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31529 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
31530
31531 EVT VT1;
31532 MVT RegisterVT;
31533 unsigned NumIntermediates;
31534 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
31535 RegisterVT);
31536 return RegisterVT;
31537}
31538
31539unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
31540 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
31541 bool NonUnitFixedLengthVector =
31542 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
31543 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31544 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
31545
31546 EVT VT1;
31547 MVT VT2;
31548 unsigned NumIntermediates;
31549 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
31550 NumIntermediates, VT2);
31551}
31552
31553unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
31554 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
31555 unsigned &NumIntermediates, MVT &RegisterVT) const {
31556 unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
31557 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
31558 if (!RegisterVT.isFixedLengthVector() ||
31559 RegisterVT.getFixedSizeInBits() <= 128)
31560 return NumRegs;
31561
31562 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
31563 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
31564 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
31565
31566 // A size mismatch here implies either type promotion or widening and would
31567 // have resulted in scalarisation if larger vectors had not been available.
31568 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
31569 EVT EltTy = VT.getVectorElementType();
31570 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
31571 if (!isTypeLegal(NewVT))
31572 NewVT = EltTy;
31573
31574 IntermediateVT = NewVT;
31575 NumIntermediates = VT.getVectorNumElements();
31576 RegisterVT = getRegisterType(Context, NewVT);
31577 return NumIntermediates;
31578 }
31579
31580 // SVE VLS support does not introduce a new ABI so we should use NEON sized
31581 // types for vector arguments and returns.
31582
31583 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
31584 NumIntermediates *= NumSubRegs;
31585 NumRegs *= NumSubRegs;
31586
31587 switch (RegisterVT.getVectorElementType().SimpleTy) {
31588 default:
31589 llvm_unreachable("unexpected element type for vector");
31590 case MVT::i8:
31591 IntermediateVT = RegisterVT = MVT::v16i8;
31592 break;
31593 case MVT::i16:
31594 IntermediateVT = RegisterVT = MVT::v8i16;
31595 break;
31596 case MVT::i32:
31597 IntermediateVT = RegisterVT = MVT::v4i32;
31598 break;
31599 case MVT::i64:
31600 IntermediateVT = RegisterVT = MVT::v2i64;
31601 break;
31602 case MVT::f16:
31603 IntermediateVT = RegisterVT = MVT::v8f16;
31604 break;
31605 case MVT::f32:
31606 IntermediateVT = RegisterVT = MVT::v4f32;
31607 break;
31608 case MVT::f64:
31609 IntermediateVT = RegisterVT = MVT::v2f64;
31610 break;
31611 case MVT::bf16:
31612 IntermediateVT = RegisterVT = MVT::v8bf16;
31613 break;
31614 }
31615
31616 return NumRegs;
31617}
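// For example (illustrative, assuming 256-bit SVE VLS codegen), the breakdown
// above turns a v8i32 argument, whose natural register type would be a single
// 256-bit fixed-length vector, into 2 x v4i32, so that vector arguments and
// returns keep the existing NEON-sized ABI.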
31618
31619bool AArch64TargetLowering::hasInlineStackProbe(
31620 const MachineFunction &MF) const {
31621 return !Subtarget->isTargetWindows() &&
31622 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
31623}
31624
31626 switch (Opc) {
31630 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
31631 return true;
31632 }
31633
31635}
31636
31638 EVT VT) const {
31639 return Subtarget->hasCPA() && UseFEATCPACodegen;
31640}
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
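The cl::opt entries in this index (e.g. MaxXors above) are ordinary LLVM command-line knobs. A minimal sketch of how such an option is declared and read; the option name and helper below are hypothetical and not part of this file:
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical tuning knob in the style of the options listed above.
static cl::opt<unsigned> ExampleMaxDepth(
    "example-max-depth", cl::Hidden,
    cl::desc("Maximum recursion depth for an example combine"),
    cl::init(8));

static bool exampleWithinBudget(unsigned Depth) {
  // cl::opt<unsigned> converts implicitly to its underlying value.
  return Depth <= ExampleMaxDepth;
}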
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm && (UImm != ~static_cast< T >(0)) && "Invalid immediate!")
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static bool isConstant(const MachineInstr &MI)
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
basic Basic Alias true
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
@ Default
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
mir Rename Register Operands
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
#define T
This file defines ARC utility functions which are used by various parts of the compiler.
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable, if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
The Input class is used to parse a yaml document into in-memory structs and vectors.
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMaximumJumpTableSize() const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
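As a rough illustration of how a client might consult the hook above, a minimal sketch; the helper name is hypothetical. On AArch64 a legal compare immediate is essentially a 12-bit value, optionally shifted left by 12.
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical helper: decide whether an immediate must be materialized into a
// register before it can be used directly in an integer compare.
static bool needsImmMaterialization(const TargetLowering &TLI, int64_t Imm) {
  return !TLI.isLegalICmpImmediate(Imm);
}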
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
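A minimal sketch of querying isLegalAddressingMode above; the AddrMode fields shown (HasBaseReg, BaseOffs, Scale) are the standard TargetLowering::AddrMode members, while the wrapper function itself is hypothetical:
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Ask the target whether [base + Offset + Scale*index] is a legal mode for a
// load/store of AccessTy in address space AS.
static bool canFoldAddrMode(const TargetLowering &TLI, const DataLayout &DL,
                            Type *AccessTy, unsigned AS, int64_t Offset,
                            int64_t Scale) {
  TargetLowering::AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset;
  AM.Scale = Scale; // 0 means no scaled index register
  return TLI.isLegalAddressingMode(DL, AM, AccessTy, AS);
}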
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
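The APInt entries above are the bit-manipulation helpers used throughout immediate and mask analysis in this file. A minimal sketch of how a few of them compose; the values and function name are purely illustrative:
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintExample() {
  APInt Imm(64, 0xFFull << 8);        // 64-bit value 0xFF00
  bool IsLowMask = Imm.isMask(16);    // false: bits 0-7 are clear
  unsigned TZ = Imm.countr_zero();    // 8 trailing zero bits
  APInt Narrow = Imm.trunc(32);       // keep the low 32 bits
  APInt Wide = Narrow.zext(128);      // zero-extend back out
  uint64_t Raw = Wide.getZExtValue(); // 0xFF00 again
  (void)IsLowMask; (void)TZ; (void)Raw;
}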
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ And
*p = old & v
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
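A minimal sketch of inspecting an atomicrmw with the accessors above, in the spirit of a shouldExpandAtomicRMWInIR-style hook; the predicate itself is hypothetical and not the backend's actual policy:
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isPlainIntegerRMW(const AtomicRMWInst *RMW) {
  if (RMW->isFloatingPointOperation()) // FAdd/FMin/FMax and friends
    return false;
  switch (RMW->getOperation()) {
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    return true;
  default:
    return false;
  }
}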
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
LLVM_ABI bool isConstant() const
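A minimal sketch of the isConstantSplat query above, as a DAG combine might use it; the wrapper name is hypothetical:
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isAllOnesSplatBuildVector(SDValue Op) {
  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BVN)
    return false;
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs) &&
         SplatValue.isAllOnes();
}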
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
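A minimal sketch of how CheckReturn above is typically driven when deciding whether a return can be lowered; the wrapper is hypothetical and the assignment function is passed in rather than naming a specific RetCC:
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

static bool canLowerReturnSketch(CallingConv::ID CallConv, bool IsVarArg,
                                 MachineFunction &MF,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 LLVMContext &Context, CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, AssignFn);
}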
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
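A minimal sketch of the two DataLayout queries above; the helper name is hypothetical:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static TypeSize allocSizeWithPrefAlign(const DataLayout &DL, Type *Ty,
                                       Align &PrefAlign) {
  PrefAlign = DL.getPrefTypeAlign(Ty); // preferred alignment for Ty
  return DL.getTypeAllocSize(Ty);      // size in bytes, including padding
}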
A debug info location.
Definition DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:321
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1936
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2251
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
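A minimal sketch of the IRBuilderBase calls above (CreateConstGEP1_32, CreateCall, getInt8Ty), composing an i8 GEP at a fixed byte offset and a call; all names are placeholders rather than the actual lowering code:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitByteOffsetCall(IRBuilderBase &IRB, Value *BasePtr,
                                 unsigned ByteOffset, FunctionCallee Helper) {
  Value *Addr =
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), BasePtr, ByteOffset, "addr");
  return IRB.CreateCall(Helper, {Addr});
}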
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
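A minimal sketch combining the MVT queries above; the helper is hypothetical and only meant to show the accessors, not a real legalization rule:
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static MVT halveIfWideFixedVector(MVT VT) {
  if (VT.isFixedLengthVector() && VT.getFixedSizeInBits() > 64)
    return VT.getHalfNumVectorElementsVT(); // same element type, half the lanes
  return VT;
}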
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool hasScalableStackID(int ObjectIdx) const
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
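A minimal sketch of the frame-object creation calls above; sizes, offsets, and the helper name are illustrative only:
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static void createExampleFrameObjects(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // A 16-byte, 16-byte-aligned spill slot.
  int SpillFI = MFI.CreateSpillStackObject(16, Align(16));
  // An 8-byte immutable object at SP+0 for a stack-passed argument.
  int ArgFI = MFI.CreateFixedObject(8, /*SPOffset=*/0, /*IsImmutable=*/true);
  (void)SpillFI;
  (void)ArgFI;
}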
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
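A minimal sketch of the MachineInstrBuilder chain above, in the style of a custom inserter; the instruction description, registers, and immediate are stand-ins:
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

static void emitExampleInstr(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator InsertPt,
                             const DebugLoc &DL, const MCInstrDesc &MCID,
                             Register DestReg, Register SrcReg, int64_t Imm) {
  BuildMI(MBB, InsertPt, DL, MCID, DestReg)
      .addReg(SrcReg)
      .addImm(Imm);
}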
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
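These MLOAD/MSTORE accessors usually appear as guards in DAG combines. A hedged sketch of that shape; the predicate itself is illustrative:

// Sketch: filter for plain, unindexed masked accesses before attempting a
// combine; N is the node currently being visited.
static bool isSimpleUnindexedMaskedAccess(SDNode *N) {
  if (auto *MLD = dyn_cast<MaskedLoadSDNode>(N))
    return MLD->isUnindexed() &&
           MLD->getExtensionType() == ISD::NON_EXTLOAD &&
           !MLD->getMask().isUndef();
  if (auto *MST = dyn_cast<MaskedStoreSDNode>(N))
    return MST->isUnindexed() && !MST->isTruncatingStore();
  return false;
}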
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:712
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
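A sketch of the usual SDNode/SDValue matching idiom these accessors support; the matched pattern is illustrative, not one of this file's combines:

// Sketch: match (add x, (shl y, C)) where the shift has a single use.
static SDValue matchAddOfShiftedOperand(SDNode *N) {
  if (N->getOpcode() != ISD::ADD)
    return SDValue();
  SDValue RHS = N->getOperand(1);
  if (RHS.getOpcode() == ISD::SHL && RHS.hasOneUse() &&
      isa<ConstantSDNode>(RHS.getOperand(1))) {
    uint64_t ShAmt = RHS.getConstantOperandVal(1);
    (void)ShAmt; // a real combine would rebuild the node with DAG.getNode here
    return RHS;
  }
  return SDValue();
}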
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
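Most of the SelectionDAG entry points above are node factories. A minimal sketch of how they compose; VT, DL and DAG come from the lowering context, and the 0xFF mask is illustrative:

// Sketch: build (and (add A, B), 0xFF) with the factory methods above.
static SDValue buildMaskedAdd(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                              SDValue A, SDValue B) {
  SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, A, B);
  SDValue Mask = DAG.getConstant(0xFF, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, Sum, Mask);
}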
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
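A sketch combining the ShuffleVectorInst mask classifiers with ShuffleVectorSDNode::getMask; the predicate is illustrative:

// Sketch: does this VECTOR_SHUFFLE reverse a single input vector?
static bool isSingleSourceReverse(const ShuffleVectorSDNode *SVN,
                                  int NumSrcElts) {
  ArrayRef<int> Mask = SVN->getMask();
  return ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts) &&
         ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}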
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
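These containers back the usual visited-set/worklist idiom seen throughout SelectionDAG code. A small sketch, assuming SmallVector::pop_back_val (part of SmallVector, though not listed above):

// Sketch: count the nodes reachable from Root through its operands.
static unsigned countReachableNodes(const SDNode *Root) {
  SmallPtrSet<const SDNode *, 16> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(Root);
  unsigned Count = 0;
  while (!Worklist.empty()) {
    const SDNode *N = Worklist.pop_back_val();
    if (!Visited.insert(N).second) // insert() reports whether N was new
      continue;
    ++Count;
    for (SDValue Op : N->op_values())
      Worklist.push_back(Op.getNode());
  }
  return Count;
}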
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:573
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:686
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
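StringRef and StringSwitch typically pair up when classifying inline-asm constraint strings. A hedged sketch; the constraint-to-code mapping is invented for illustration, and StringSwitch::Default is assumed from llvm/ADT/StringSwitch.h:

// Sketch: classify a constraint string; the return codes are illustrative.
static unsigned classifyConstraint(StringRef Constraint) {
  if (Constraint.starts_with("{") && Constraint.ends_with("}"))
    return 1; // explicit physical register, e.g. "{x0}"
  return StringSwitch<unsigned>(Constraint)
      .Case("r", 2) // general-purpose register
      .Case("w", 3) // SIMD/FP register
      .Case("m", 4) // memory operand
      .Default(0);
}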
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
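The TargetLoweringBase hooks above are normally invoked from a target's TargetLowering constructor, since the setters are protected. A hedged sketch with a hypothetical MyTargetLowering; the concrete types, actions and register class are illustrative:

// Sketch only: MyTargetLowering is hypothetical; the choices below are not
// this file's actual configuration.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const TargetRegisterInfo *TRI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Legal);
  setTargetDAGCombine(ISD::ADD);
  setMaxAtomicSizeInBitsSupported(128);
  computeRegisterProperties(TRI);
}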
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:299
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:283
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
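The AArch64_AM helpers above follow a test-then-encode pattern for immediates. A minimal sketch for the logical-immediate pair:

// Sketch: check whether Mask is representable as a 64-bit logical immediate
// and, if so, produce the encoded form used by AND/ORR/EOR instructions.
static bool tryEncodeLogicalImm64(uint64_t Mask, uint64_t &Encoded) {
  if (!AArch64_AM::isLogicalImmediate(Mask, /*regSize=*/64))
    return false;
  Encoded = AArch64_AM::encodeLogicalImmediate(Mask, /*regSize=*/64);
  return true;
}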
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ LOOP_DEPENDENCE_RAW_MASK
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:855
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
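The SHL/SRA pair described above can be illustrated with ordinary integer arithmetic. A minimal sketch, assuming two's-complement semantics for the final arithmetic right shift (guaranteed from C++20); signExtendInReg is a hypothetical helper name used only for illustration, not an LLVM API.
#include <cassert>
#include <cstdint>

// Hypothetical helper: sign-extend the low FromBits of a 32-bit value by
// shifting the field up to the top and arithmetic-shifting it back down,
// i.e. the SHL/SRA pair that SIGN_EXTEND_INREG denotes.
static int32_t signExtendInReg(int32_t Value, unsigned FromBits) {
  unsigned Shift = 32 - FromBits;
  return static_cast<int32_t>(static_cast<uint32_t>(Value) << Shift) >> Shift;
}

int main() {
  assert(signExtendInReg(0xFF, 8) == -1);  // low byte 0xFF reads as -1
  assert(signExtendInReg(0x7F, 8) == 127); // sign bit clear, value unchanged
  return 0;
}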
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:627
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:648
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:611
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
@ LOOP_DEPENDENCE_WAR_MASK
Set rounding mode.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
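A minimal sketch of the two condition-code helpers above, assuming LLVM headers are on the include path (header locations can vary between LLVM versions): the inverse of an integer SETLT is SETGE, and swapping its operands yields SETGT.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>

using namespace llvm;

int main() {
  // !(X < Y) is (X >= Y) for integer comparisons.
  assert(ISD::getSetCCInverse(ISD::SETLT, MVT::i32) == ISD::SETGE);
  // (Y op X) for op == SETLT is (X > Y).
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
  return 0;
}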
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
const unsigned VectorBits
Definition SystemZ.h:154
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
bool isPackedVectorType(EVT SomeVT)
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
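A short worked example, assuming llvm/Support/MathExtras.h from an LLVM checkout: maxUIntN(N) is 2^N - 1, and isUIntN(N, x) (listed further below) checks x against that bound.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::maxUIntN(8) == 255u);    // 2^8 - 1
  assert(llvm::maxUIntN(16) == 65535u); // 2^16 - 1
  assert(llvm::isUIntN(12, 4095) && !llvm::isUIntN(12, 4096));
  return 0;
}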
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or an FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
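A small sketch exercising the bit-manipulation helpers described above (isPowerOf2_64, Log2_64, countr_zero, isShiftedMask_64), assuming the MathExtras.h and bit.h headers from an LLVM checkout.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isPowerOf2_64(64) && !llvm::isPowerOf2_64(0));
  assert(llvm::Log2_64(64) == 6);          // floor log base 2
  assert(llvm::countr_zero(0x50u) == 4);   // 0b0101'0000 has 4 trailing zeros
  assert(llvm::isShiftedMask_64(0x0ff0));  // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x0f0f)); // two runs, so not a shifted mask
  return 0;
}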
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:270
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
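A sketch of the zip/uzp/trn mask predicates described above, using the zip1 form <0, 8, 1, 9, 2, 10, 3, 11> from the brief. These helpers live inside the AArch64 backend (the AArch64PerfectShuffle.h include is an assumption and may differ by tree), so the snippet is illustrative rather than standalone.
#include "AArch64PerfectShuffle.h" // assumed location inside the AArch64 backend
#include "llvm/ADT/ArrayRef.h"
#include <cassert>

static void checkShuffleMasks() {
  // zip1 of two 8-element vectors interleaves their low halves.
  int ZipMask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult = ~0u;
  // WhichResult of 0 is expected for the zip1/uzp1 forms quoted in the briefs.
  assert(llvm::isZIPMask(ZipMask, /*NumElts=*/8, WhichResult) && WhichResult == 0);

  // uzp1 keeps the even-indexed elements of the concatenated vectors.
  int UzpMask[] = {0, 2, 4, 6, 8, 10, 12, 14};
  assert(llvm::isUZPMask(UzpMask, /*NumElts=*/8, WhichResult) && WhichResult == 0);
}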
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
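A worked example for alignTo, assuming llvm/Support/Alignment.h: sizes are rounded up to the next multiple of the alignment.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16); // round 10 up to an 8-byte multiple
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already a multiple, unchanged
  return 0;
}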
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2100
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
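A sketch for commonAlignment, assuming llvm/Support/Alignment.h: the result is the largest alignment still guaranteed after stepping a pointer of alignment A by Offset bytes.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));
  return 0;
}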
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2088
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
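A sketch of createSequentialMask, assuming llvm/Analysis/VectorUtils.h: the mask counts up from Start for NumInts lanes and then appends NumUndefs undef (-1) lanes.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

void buildMask() {
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/0, /*NumInts=*/4, /*NumUndefs=*/2);
  // Mask now holds {0, 1, 2, 3, -1, -1}.
}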
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:207
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
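A sketch of the EVT helpers listed above, assuming llvm/CodeGen/ValueTypes.h and an LLVMContext: build a fixed v4i32 and its scalable counterpart, then query element count, element type, size, and scalability.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void buildVectorVTs() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4i32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4);
  llvm::EVT NxV4i32 =
      llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4, /*IsScalable=*/true);

  assert(V4i32.getVectorNumElements() == 4);
  assert(V4i32.getVectorElementType() == llvm::MVT::i32);
  assert(!V4i32.isScalableVector() && NxV4i32.isScalableVector());
  assert(V4i32.getFixedSizeInBits() == 128 && V4i32.is128BitVector());
}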
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
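A sketch of the KnownBits helpers listed above, assuming llvm/Support/KnownBits.h and llvm/ADT/APInt.h: fully-known constants stay fully known through add, lshr, and trunc.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  using llvm::APInt;
  using llvm::KnownBits;

  KnownBits A = KnownBits::makeConstant(APInt(32, 0xF0)); // every bit known
  KnownBits B = KnownBits::makeConstant(APInt(32, 0x0F));

  KnownBits Sum = KnownBits::add(A, B); // exactly 0xFF, still fully known
  assert(Sum.getBitWidth() == 32 && !Sum.isZero());

  KnownBits Shifted =
      KnownBits::lshr(Sum, KnownBits::makeConstant(APInt(32, 8)));
  assert(Shifted.isZero());             // 0xFF >> 8 is known to be zero

  KnownBits Low = Sum.trunc(8);         // keep only the low 8 bits
  assert(Low.getBitWidth() == 8);
}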
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64