1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
 142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
 143// bottleneck after this transform on high-end CPUs. This maximum leaf-node
 144// limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
 148// When this option is enabled, we do not fall back to DAG ISel when
 149// encountering scalable vector types, even though GlobalISel does not yet
 150// support SVE for some instructions.
 151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
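// For example, the mapping above turns MVT::f16 into nxv8f16: eight f16
// elements exactly fill one 128-bit SVE vector granule, so the result is a
// "packed" type with no unused bit positions.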
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
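// For example, the mapping above turns an nxv4i1 predicate into nxv4i32, the
// packed integer data vector with the same number of lanes.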
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
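// Conceptually, a node such as FABS_MERGE_PASSTHRU(Pg, Vn, Passthru) computes
// fabs(Vn) in the lanes where the predicate Pg is active and takes the
// corresponding Passthru lanes elsewhere, mirroring SVE's merging predication.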
290
291// Returns true if inactive lanes are known to be zeroed by construction.
293 switch (Op.getOpcode()) {
294 default:
295 return false;
 296 // We guarantee i1 splat_vectors to zero the other lanes
 297 case ISD::SPLAT_VECTOR:
 298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
 302 case ISD::INTRINSIC_WO_CHAIN:
 303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
 378 // If there's no address discriminator, use NoRegister, which we'll later
 379 // replace with XZR, or directly use a Z variant of the instruction when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
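// In effect: a discriminator built by llvm.ptrauth.blend(AddrDisc, K), where K
// is a 16-bit constant, is split into (target constant K, AddrDisc); a bare
// 16-bit constant K becomes (K, NoRegister); any other discriminator is
// returned as (0, Disc) so it can be computed separately.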
387
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
486 computeRegisterProperties(Subtarget->getRegisterInfo());
487
488 // Provide all sorts of operation actions
506 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
507 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
508 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
509 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
510 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
511 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
514 if (Subtarget->hasFPARMv8()) {
517 }
526 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
528 setOperationAction(ISD::BRIND, MVT::Other, Custom);
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
548 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
549 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
550
551 // Virtually no operation on f128 is legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
553 setOperationAction(ISD::FABS, MVT::f128, Expand);
556 setOperationAction(ISD::FCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FNEG, MVT::f128, Expand);
561 setOperationAction(ISD::FPOW, MVT::f128, Expand);
563 setOperationAction(ISD::FRINT, MVT::f128, Expand);
564 setOperationAction(ISD::FSIN, MVT::f128, Expand);
565 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
566 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
568 setOperationAction(ISD::FTAN, MVT::f128, Expand);
569 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
573 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
576 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
625 setOperationAction(ISD::VASTART, MVT::Other, Custom);
626 setOperationAction(ISD::VAARG, MVT::Other, Custom);
627 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
628 setOperationAction(ISD::VAEND, MVT::Other, Expand);
629
630 // Variable-sized objects.
631 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
632 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
633
634 // Lowering Funnel Shifts to EXTR
639
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
737 setOperationAction(ISD::FSIN, MVT::f32, Expand);
738 setOperationAction(ISD::FSIN, MVT::f64, Expand);
739 setOperationAction(ISD::FCOS, MVT::f32, Expand);
740 setOperationAction(ISD::FCOS, MVT::f64, Expand);
741 setOperationAction(ISD::FPOW, MVT::f32, Expand);
742 setOperationAction(ISD::FPOW, MVT::f64, Expand);
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
754 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
755 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
756 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
757 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
758 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
759 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
782 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
783 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
786 // fpextend from bf16 to f64 needs to be split into two fpextends
787 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
794 ISD::BR_CC,
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
800 ISD::FCEIL,
801 ISD::FSQRT,
802 ISD::FFLOOR,
803 ISD::FNEARBYINT,
804 ISD::FRINT,
805 ISD::FROUND,
806 ISD::FROUNDEVEN,
807 ISD::FTRUNC,
808 ISD::FMINNUM,
809 ISD::FMAXNUM,
810 ISD::FMINIMUM,
811 ISD::FMAXIMUM,
812 ISD::FMINIMUMNUM,
813 ISD::FMAXIMUMNUM,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
 838 // Round-to-integer operations need custom lowering for fp16, as Promote doesn't work
839 // because the result type is integer.
840 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
843 setOperationAction(Op, ScalarVT, Custom);
844
 845 // Promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
861 setOperationAction(ISD::FABS, V4Narrow, Legal);
862 setOperationAction(ISD::FNEG, V4Narrow, Legal);
864 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
868 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
874 setOperationAction(ISD::FABS, V8Narrow, Legal);
876 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
879 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
882 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
883 setOperationAction(ISD::FNEG, V8Narrow, Legal);
884 setOperationAction(ISD::FROUND, V8Narrow, Legal);
885 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
886 setOperationAction(ISD::FRINT, V8Narrow, Legal);
887 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
889 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
890 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
893 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
906 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
907 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
908 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
909 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
910 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
911 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
917 for (MVT Ty : {MVT::f32, MVT::f64})
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
933 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
934
936 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
937 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
938 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
939 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
940
941 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
943 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
944 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
945 } else {
946 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
947 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
948 }
949 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
950 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
951
 952 // Generate outline atomics library calls only if LSE was not specified for
 953 // the subtarget.
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
955 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
956 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
957 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
958 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
959 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
960 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
961 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
962 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
963 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
974 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
975 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
976 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
977 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
980 }
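// With outline atomics these operations become calls to runtime helpers
// (named along the lines of __aarch64_cas4_acq_rel or __aarch64_ldadd8_relax)
// provided by compiler-rt or libgcc, which use LSE instructions when the CPU
// supports them and fall back to LL/SC loops otherwise.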
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
983 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
986 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
987
988 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
991 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
992
993 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
996 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
997
998 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
1001 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
1002
1003 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
1004 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
1005 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1006 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1012 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1013 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1014 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::STORE, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1024 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1025 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1026 }
1027
 1028 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
 1029 // custom lowering, as there are no un-paired non-temporal stores and
 1030 // legalization will break up 256-bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1039
 1040 // 256-bit non-temporal loads can be lowered to LDNP. This is done using
 1041 // custom lowering, as there are no un-paired non-temporal loads and
 1042 // legalization will break up 256-bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1053 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1058 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060 } else {
1061 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063 }
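// __sincos_stret, where the platform provides it (e.g. Darwin), returns both
// the sine and cosine from a single call, so FSINCOS can be lowered to one
// libcall instead of separate sin and cos calls.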
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
 1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
 1073 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1093 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1094 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1095 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1096 }
1097
1098 // Indexed loads and stores are supported.
1099 for (unsigned im = (unsigned)ISD::PRE_INC;
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
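// This enables AArch64's pre- and post-indexed addressing modes, e.g.
// "ldr x0, [x1, #8]!" (pre-index with writeback) and "ldr x0, [x1], #8"
// (post-index), for the scalar integer and FP types listed above.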
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1121 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1130 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1131 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
 1134 // Also, try to fold ADD into CSINC/CSINV.
1137
1140
1141 // Try and combine setcc with csel
1143
1145
1149 ISD::STORE, ISD::BUILD_VECTOR});
1152 setTargetDAGCombine(ISD::LOAD);
1153
1154 setTargetDAGCombine(ISD::MSTORE);
1155
1157
1159
1162 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1163
1165 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1166
1167 setTargetDAGCombine(ISD::FP_EXTEND);
1168
1170
1172
1173 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1174
1175 setTargetDAGCombine(ISD::VECREDUCE_AND);
1176 setTargetDAGCombine(ISD::VECREDUCE_OR);
1177 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1178
1180
1183
1184 // In case of strict alignment, avoid an excessive number of byte wide stores.
1187 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1188
1192 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1193
1196 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1197
1200 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1201
1203
1205
1206 EnableExtLdPromotion = true;
1207
1208 // Set required alignment.
1210 // Set preferred alignments.
1211
1212 // Don't align loops on Windows. The SEH unwind info generation needs to
1213 // know the exact length of functions before the alignments have been
1214 // expanded.
1215 if (!Subtarget->isTargetWindows())
1219
1220 // Only change the limit for entries in a jump table if specified by
 1221 // the subtarget, but not at the command line.
1222 unsigned MaxJT = STI.getMaximumJumpTableSize();
1223 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1225
1227
1229
1231 if (Subtarget->hasSME())
1233
1234 if (Subtarget->isNeonAvailable()) {
1235 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1236 // silliness like this:
1237 // clang-format off
1238 for (auto Op :
1239 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1240 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1242 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1243 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1244 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1245 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1246 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1247 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1248 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1249 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1250 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1251 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1252 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1259 setOperationAction(Op, MVT::v1f64, Expand);
1260 // clang-format on
1261
1262 for (auto Op :
1267 setOperationAction(Op, MVT::v1i64, Expand);
1268
 1269 // AArch64 doesn't have direct vector->f32 conversion instructions for
1270 // elements smaller than i32, so promote the input to i32 first.
1271 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1273
 1274 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
 1275 // nor a direct i32 -> f16 vector conversion. Set these to Custom, so the
 1276 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1279 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1281
1282 if (Subtarget->hasFullFP16()) {
1285
1294 } else {
 1295 // When AArch64 doesn't have full fp16 support, promote the input
 1296 // to i32 first.
1297 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1299 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1301 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1303 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1305 }
1306
1307 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1308 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1315 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1320 }
1321
1322 // Custom handling for some quad-vector types to detect MULL.
1323 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1324 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1325 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1326 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1327 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1328 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1329
1330 // Saturates
1331 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1332 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1337 }
1338
1339 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1340 MVT::v4i32}) {
1347 }
1348
1349 // Vector reductions
1350 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1351 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1352 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1353 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1354 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1355 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1356 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1357
1358 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1359 }
1360 }
1361 if (Subtarget->hasFullFP16())
1362 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1363
1364 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1365 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1366 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1367 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1368 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1369 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1370 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1371 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1372 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1373 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1374 }
1375 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1376 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1377 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1378 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1379
1381 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1382 // Likewise, narrowing and extending vector loads/stores aren't handled
1383 // directly.
1386
1387 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1390 } else {
1393 }
1396
1399
1400 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1401 setTruncStoreAction(VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1404 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1405 }
1406 }
1407
1408 for (auto Op :
1409 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1410 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1414 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1416 if (Subtarget->hasFullFP16())
1417 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1419 }
1420
1421 // LRINT and LLRINT.
1422 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1423 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1425 if (Subtarget->hasFullFP16())
1426 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1428 }
1429
1430 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1431
1432 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1433 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1434 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1435 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1436
1437 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1438 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1439 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1440
1441 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1447
1448 // ADDP custom lowering
1449 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1451 // FADDP custom lowering
1452 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1454
1455 if (Subtarget->hasDotProd()) {
1456 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457 ISD::PARTIAL_REDUCE_UMLA};
1458
1459 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1461 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1462
1463 if (Subtarget->hasMatMulInt8()) {
1464 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1465 MVT::v16i8, Legal);
1466 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1467 MVT::v16i8, Custom);
1468
1469 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1470 MVT::v8i8, Legal);
1471 }
1472 }
1473
1474 } else /* !isNeonAvailable */ {
1476 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1478
1479 if (VT.is128BitVector() || VT.is64BitVector()) {
1480 setOperationAction(ISD::LOAD, VT, Legal);
1481 setOperationAction(ISD::STORE, VT, Legal);
1482 setOperationAction(ISD::BITCAST, VT,
1483 Subtarget->isLittleEndian() ? Legal : Expand);
1484 }
1485 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1486 setTruncStoreAction(VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1489 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1490 }
1491 }
1492 }
1493
1494 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1498 }
1499
1500 if (Subtarget->hasSME()) {
1502 }
1503
1504 // FIXME: Move lowering for more nodes here if those are common between
1505 // SVE and SME.
1506 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1507 for (auto VT :
1508 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1513 }
1514 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1515 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1516 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1517 }
1518
1519 if (Subtarget->hasSVE2p1() ||
1520 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1522
1523 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1525 }
1526
1527 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1528 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1539 setOperationAction(ISD::MLOAD, VT, Custom);
1540 setOperationAction(ISD::MSTORE, VT, Legal);
1560 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1561 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1562 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1563 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1564 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1565 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1566 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1567 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1570
1576
1585
1590
1591 if (!Subtarget->isLittleEndian())
1592 setOperationAction(ISD::BITCAST, VT, Custom);
1593
1594 if (Subtarget->hasSVE2() ||
1595 (Subtarget->hasSME() && Subtarget->isStreaming()))
1596 // For SLI/SRI.
1598 }
1599
1600 // Illegal unpacked integer vector types.
1601 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1604 }
1605
1606 // Type legalize unpacked bitcasts.
1607 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1608 setOperationAction(ISD::BITCAST, VT, Custom);
1609
1610 for (auto VT :
1611 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1612 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1614
1615 for (auto VT :
1616 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1621 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1623 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1624
1628
1629 // There are no legal MVT::nxv16f## based types.
1630 if (VT != MVT::nxv16i1) {
1635 }
1636 }
1637
1638 // NEON doesn't support masked loads/stores, but SME and SVE do.
1639 for (auto VT :
1640 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1641 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1642 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1643 setOperationAction(ISD::MLOAD, VT, Custom);
1644 setOperationAction(ISD::MSTORE, VT, Custom);
1645 }
1646
1647 // Firstly, exclude all scalable vector extending loads/truncating stores,
 1648 // including both integer and floating-point scalable vectors.
1650 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1651 setTruncStoreAction(VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1654 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1655 }
1656 }
1657
1658 // Then, selectively enable those which we directly support.
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1661 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1663 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1664 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1665 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1668 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1670 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1671 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1672 }
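// These correspond directly to SVE's extending loads; for example, a
// zero-extending load from nxv2i16 to nxv2i64 can be selected as LD1H with
// 64-bit elements, and the sign-extending form as LD1SH.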
1673
 1674 // SVE supports truncating stores of 64- and 128-bit vectors.
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1677 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1679 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1680
1681 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1682 MVT::nxv4f32, MVT::nxv2f64}) {
1683 setOperationAction(ISD::BITCAST, VT, Custom);
1686 setOperationAction(ISD::MLOAD, VT, Custom);
1694 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1695 setOperationAction(ISD::FMAXNUM, VT, Custom);
1696 setOperationAction(ISD::FMINIMUM, VT, Custom);
1697 setOperationAction(ISD::FMINNUM, VT, Custom);
1699 setOperationAction(ISD::FNEG, VT, Custom);
1701 setOperationAction(ISD::FCEIL, VT, Custom);
1702 setOperationAction(ISD::FFLOOR, VT, Custom);
1703 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1704 setOperationAction(ISD::FRINT, VT, Custom);
1705 setOperationAction(ISD::LRINT, VT, Custom);
1706 setOperationAction(ISD::LLRINT, VT, Custom);
1707 setOperationAction(ISD::FROUND, VT, Custom);
1708 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1709 setOperationAction(ISD::FTRUNC, VT, Custom);
1710 setOperationAction(ISD::FSQRT, VT, Custom);
1711 setOperationAction(ISD::FABS, VT, Custom);
1712 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1722
1725 setOperationAction(ISD::FPOW, VT, Expand);
1726 setOperationAction(ISD::FPOWI, VT, Expand);
1727 setOperationAction(ISD::FCOS, VT, Expand);
1728 setOperationAction(ISD::FSIN, VT, Expand);
1729 setOperationAction(ISD::FSINCOS, VT, Expand);
1730 setOperationAction(ISD::FTAN, VT, Expand);
1731 setOperationAction(ISD::FACOS, VT, Expand);
1732 setOperationAction(ISD::FASIN, VT, Expand);
1733 setOperationAction(ISD::FATAN, VT, Expand);
1734 setOperationAction(ISD::FATAN2, VT, Expand);
1735 setOperationAction(ISD::FCOSH, VT, Expand);
1736 setOperationAction(ISD::FSINH, VT, Expand);
1737 setOperationAction(ISD::FTANH, VT, Expand);
1738 setOperationAction(ISD::FEXP, VT, Expand);
1739 setOperationAction(ISD::FEXP2, VT, Expand);
1740 setOperationAction(ISD::FEXP10, VT, Expand);
1741 setOperationAction(ISD::FLOG, VT, Expand);
1742 setOperationAction(ISD::FLOG2, VT, Expand);
1743 setOperationAction(ISD::FLOG10, VT, Expand);
1744
1756 }
1757
1758 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1759 setOperationAction(ISD::BITCAST, VT, Custom);
1761 setOperationAction(ISD::FABS, VT, Custom);
1763 setOperationAction(ISD::FNEG, VT, Custom);
1764 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1766 setOperationAction(ISD::MLOAD, VT, Custom);
1774
1775 if (Subtarget->hasSVEB16B16() &&
1776 Subtarget->isNonStreamingSVEorSME2Available()) {
1779 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1780 setOperationAction(ISD::FMAXNUM, VT, Custom);
1781 setOperationAction(ISD::FMINIMUM, VT, Custom);
1782 setOperationAction(ISD::FMINNUM, VT, Custom);
1785 }
1786 }
1787
1788 for (auto Opcode :
1789 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1790 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1791 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1792 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1793 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1795 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1796 }
1797
1798 if (!Subtarget->hasSVEB16B16() ||
1799 !Subtarget->isNonStreamingSVEorSME2Available()) {
1800 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1801 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1802 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1804 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1805 }
1806 }
1807
1810
1811 // NEON doesn't support integer divides, but SVE does
1812 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1813 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1816 }
1817
1818 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1819 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1820 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1821
1822 // NOTE: Currently this has to happen after computeRegisterProperties rather
1823 // than the preferred option of combining it with the addRegisterClass call.
1824 if (Subtarget->useSVEForFixedLengthVectors()) {
1827 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1828 addTypeForFixedLengthSVE(VT);
1829 }
1832 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1833 addTypeForFixedLengthSVE(VT);
1834 }
1835
 1836 // 64-bit results can mean a bigger-than-NEON input.
1837 for (auto VT : {MVT::v8i8, MVT::v4i16})
1840
 1841 // 128-bit results imply a bigger-than-NEON input.
1842 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1844 for (auto VT : {MVT::v8f16, MVT::v4f32})
1846
1847 // These operations are not supported on NEON but SVE can do them.
1849 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1851 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1855 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1859 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1863 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1867 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1868
1869 // Int operations with no NEON support.
1870 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1871 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1874 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1876 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1879 }
1880
1881 // Use SVE for vectors with more than 2 elements.
1882 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1883 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1884 }
1885
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1889 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1890
1891 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1892
1893 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1895 }
1896
1897 // Handle partial reduction operations
1898 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1899 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1900 // Other pairs will default to 'Expand'.
1901 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1902 ISD::PARTIAL_REDUCE_UMLA};
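 // UDOT/SDOT accumulate four narrow products per result lane, so i8 inputs
 // pair naturally with i32 accumulators and i16 inputs with i64 accumulators.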
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1905
1906 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1907
1908 if (Subtarget->hasMatMulInt8()) {
1909 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1910 MVT::nxv16i8, Legal);
1911 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1912 MVT::nxv16i8, Custom);
1913 }
1914
1915 // Wide add types
1916 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1919 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1920 }
1921 }
1922
1923 // Handle non-aliasing elements mask
1924 if (Subtarget->hasSVE2() ||
1925 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1926 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1927 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1930 }
1931 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1934 }
1935 }
1936
1937 // Handle operations that are only available in non-streaming SVE mode.
1938 if (Subtarget->isSVEAvailable()) {
1939 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1940 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1941 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1942 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1943 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1944 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1945 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1946 setOperationAction(ISD::MGATHER, VT, Custom);
1947 setOperationAction(ISD::MSCATTER, VT, Custom);
1948 }
1949
1950 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1951 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1952 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1953 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1954
1955 // We can lower types that have <vscale x {2|4}> elements to compact.
1956 for (auto VT :
1957 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1958 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1959      setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
1960
1961 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1962 // NEON vectors in the lowest bits of the SVE register.
1963 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1964 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1965      setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
1966
1967 // Histcnt is SVE2 only
1968 if (Subtarget->hasSVE2()) {
1969 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1970 Custom);
1971 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1972 Custom);
1973
1974 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1975 ISD::PARTIAL_REDUCE_UMLA};
1976 // Must be lowered to SVE instructions.
1977 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1978 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1979 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1980 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1981 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1982 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1983 }
1984 }
1985
1986 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1987 // Only required for llvm.aarch64.mops.memset.tag
1988     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1989
1990
1992
1993 if (Subtarget->hasSVE()) {
1994 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1995 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1996 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1997 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1998 }
1999
2000 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2001
2002 IsStrictFPEnabled = true;
2004
2005 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2006 // it, but it's just a wrapper around ldexp.
2007 if (Subtarget->isTargetWindows()) {
2008 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2009 if (isOperationExpand(Op, MVT::f32))
2010 setOperationAction(Op, MVT::f32, Promote);
2011 }
2012
2013 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2014 // isn't legal.
2015 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2016 if (isOperationExpand(Op, MVT::f16))
2017 setOperationAction(Op, MVT::f16, Promote);
2018}
2019
2021 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2022}
2023
2024void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2025 assert(VT.isVector() && "VT should be a vector type");
2026
2027 if (VT.isFloatingPoint()) {
2029 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2030 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2031 }
2032
2033 // Mark vector float intrinsics as expand.
2034 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2035 setOperationAction(ISD::FSIN, VT, Expand);
2036 setOperationAction(ISD::FCOS, VT, Expand);
2037 setOperationAction(ISD::FTAN, VT, Expand);
2038 setOperationAction(ISD::FASIN, VT, Expand);
2039 setOperationAction(ISD::FACOS, VT, Expand);
2040 setOperationAction(ISD::FATAN, VT, Expand);
2041 setOperationAction(ISD::FATAN2, VT, Expand);
2042 setOperationAction(ISD::FSINH, VT, Expand);
2043 setOperationAction(ISD::FCOSH, VT, Expand);
2044 setOperationAction(ISD::FTANH, VT, Expand);
2045 setOperationAction(ISD::FPOW, VT, Expand);
2046 setOperationAction(ISD::FLOG, VT, Expand);
2047 setOperationAction(ISD::FLOG2, VT, Expand);
2048 setOperationAction(ISD::FLOG10, VT, Expand);
2049 setOperationAction(ISD::FEXP, VT, Expand);
2050 setOperationAction(ISD::FEXP2, VT, Expand);
2051 setOperationAction(ISD::FEXP10, VT, Expand);
2052 }
2053
2054 // But we do support custom-lowering for FCOPYSIGN.
2055 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2056 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2057 VT == MVT::v8f16) &&
2058 Subtarget->hasFullFP16()))
2059     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2060
2073
2077 for (MVT InnerVT : MVT::all_valuetypes())
2078 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2079
2080 // CNT supports only B element sizes, then use UADDLP to widen.
2081 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2082     setOperationAction(ISD::CTPOP, VT, Custom);
2083
2089
2090 for (unsigned Opcode :
2093 setOperationAction(Opcode, VT, Custom);
2094
2095 if (!VT.isFloatingPoint())
2097
2098 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2099 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2100 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2101 setOperationAction(Opcode, VT, Legal);
2102
2103 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2104 // NEON types.
2105 if (VT.isFloatingPoint() &&
2106 VT.getVectorElementType() != MVT::bf16 &&
2107 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2108 for (unsigned Opcode :
2109 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2110 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2114 setOperationAction(Opcode, VT, Legal);
2115
2116 // Strict fp extend and trunc are legal
2117   if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2118     setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
2119   if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2120     setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
2121
2122 // FIXME: We could potentially make use of the vector comparison instructions
2124 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2124 // complications:
2125 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2126 // so we would need to expand when the condition code doesn't match the
2127 // kind of comparison.
2128 // * Some kinds of comparison require more than one FCMXY instruction so
2129 // would need to be expanded instead.
2130 // * The lowering of the non-strict versions involves target-specific ISD
2131 // nodes so we would likely need to add strict versions of all of them and
2132 // handle them appropriately.
2135
2136 // When little-endian we can use ordinary d and q register loads/stores for
2137 // vector types, but when big-endian we need to use structure load/store which
2138 // only allow post-index addressing.
2139 if (Subtarget->isLittleEndian()) {
2140 for (unsigned im = (unsigned)ISD::PRE_INC;
2141 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2144 }
2145 } else {
2148 }
2149
2150 if (Subtarget->hasD128()) {
2153 }
2154
2155 if (VT.isInteger()) {
2156 // Let common code emit inverted variants of compares we do support.
2162 }
2163}
2164
2166 EVT OpVT) const {
2167 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2168 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2169 ResVT.getVectorElementType() != MVT::i1)
2170 return true;
2171
2172 // Only support illegal types if the result is scalable and min elements > 1.
2173 if (ResVT.getVectorMinNumElements() == 1 ||
2174 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2175 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2176 return true;
2177
2178 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2179 // but anything larger should be expanded.
2180 if (OpVT.getFixedSizeInBits() > 64)
2181 return true;
2182
2183 return false;
2184}
2185
2187 if (!Subtarget->isSVEorStreamingSVEAvailable())
2188 return true;
2189
2190 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2191 // also support fixed-width predicates.
2192 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2193 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2194 VT != MVT::v4i1 && VT != MVT::v2i1;
2195}
2196
2198 unsigned SearchSize) const {
2199 // MATCH is SVE2 and only available in non-streaming mode.
2200 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2201 return true;
2202 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2203 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2204 return SearchSize != 8;
2205 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2206 return SearchSize != 8 && SearchSize != 16;
2207 return true;
2208}
2209
2210void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2211 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2212
2213 // By default everything must be expanded.
2214 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2215     setOperationAction(Op, VT, Expand);
2216
2217 if (VT.isFloatingPoint()) {
2227 }
2228
2230 VT == MVT::v1f64 ? Expand : Custom;
2231
2232 // Mark integer truncating stores/extending loads as having custom lowering
2233 if (VT.isInteger()) {
2234 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2235 while (InnerVT != VT) {
2236 setTruncStoreAction(VT, InnerVT, Default);
2237 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2238 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2239 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2240 InnerVT = InnerVT.changeVectorElementType(
2241 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2242 }
2243 }
2244
2245 // Mark floating-point truncating stores/extending loads as having custom
2246 // lowering
2247 if (VT.isFloatingPoint()) {
2248 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2249 while (InnerVT != VT) {
2250 setTruncStoreAction(VT, InnerVT, Custom);
2251 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2252 InnerVT = InnerVT.changeVectorElementType(
2254 }
2255 }
2256
2257 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2258 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2259
2260 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2261 ISD::PARTIAL_REDUCE_UMLA};
2262 unsigned NumElts = VT.getVectorNumElements();
2263 if (VT.getVectorElementType() == MVT::i64) {
2264 setPartialReduceMLAAction(MLAOps, VT,
2265 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2266 setPartialReduceMLAAction(MLAOps, VT,
2267 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2268 setPartialReduceMLAAction(MLAOps, VT,
2269 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2270 } else if (VT.getVectorElementType() == MVT::i32) {
2271 setPartialReduceMLAAction(MLAOps, VT,
2272 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2273 setPartialReduceMLAAction(MLAOps, VT,
2274 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2275 } else if (VT.getVectorElementType() == MVT::i16) {
2276 setPartialReduceMLAAction(MLAOps, VT,
2277 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2278 }
2279 if (Subtarget->hasMatMulInt8()) {
2280 if (VT.getVectorElementType() == MVT::i32)
2281 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2282 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2283 else if (VT.getVectorElementType() == MVT::i64)
2284 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2285 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2286 }
2287
2288 // Lower fixed length vector operations to scalable equivalents.
2295 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2305 setOperationAction(ISD::FABS, VT, Default);
2307 setOperationAction(ISD::FCEIL, VT, Default);
2310 setOperationAction(ISD::FFLOOR, VT, Default);
2312 setOperationAction(ISD::FMAXIMUM, VT, Default);
2313 setOperationAction(ISD::FMAXNUM, VT, Default);
2314 setOperationAction(ISD::FMINIMUM, VT, Default);
2315 setOperationAction(ISD::FMINNUM, VT, Default);
2317 setOperationAction(ISD::FNEARBYINT, VT, Default);
2318 setOperationAction(ISD::FNEG, VT, Default);
2319 setOperationAction(ISD::FP_EXTEND, VT, Default);
2323 setOperationAction(ISD::FRINT, VT, Default);
2324 setOperationAction(ISD::LRINT, VT, Default);
2325 setOperationAction(ISD::LLRINT, VT, Default);
2326 setOperationAction(ISD::FROUND, VT, Default);
2327 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2328 setOperationAction(ISD::FSQRT, VT, Default);
2330 setOperationAction(ISD::FTRUNC, VT, Default);
2331 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2333 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2334 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2335 setOperationAction(ISD::MLOAD, VT, Default);
2336 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2337 setOperationAction(ISD::MSTORE, VT, Default);
2355 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2362 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2363 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2364 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2365 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2366 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2367 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2368 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2369 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2370 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2371 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2372 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2373 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2374 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2375 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2381}
2382
2383void AArch64TargetLowering::addDRType(MVT VT) {
2384 addRegisterClass(VT, &AArch64::FPR64RegClass);
2385 if (Subtarget->isNeonAvailable())
2386 addTypeForNEON(VT);
2387}
2388
2389void AArch64TargetLowering::addQRType(MVT VT) {
2390 addRegisterClass(VT, &AArch64::FPR128RegClass);
2391 if (Subtarget->isNeonAvailable())
2392 addTypeForNEON(VT);
2393}
2394
2396 LLVMContext &C, EVT VT) const {
2397 if (!VT.isVector())
2398 return MVT::i32;
2399 if (VT.isScalableVector())
2400 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2402}
2403
2404// isIntImmediate - This method tests to see if the node is a constant
2405 // operand. If so, Imm will receive the value.
2406static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2408 Imm = C->getZExtValue();
2409 return true;
2410 }
2411 return false;
2412}
2413
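// isVectorizedBinOp - Returns true for target-specific opcodes that behave
// like ordinary vectorized binary operations (currently only SQDMULH).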
2414bool isVectorizedBinOp(unsigned Opcode) {
2415 switch (Opcode) {
2416 case AArch64ISD::SQDMULH:
2417 return true;
2418 default:
2419 return false;
2420 }
2421}
2422
2423// isOpcWithIntImmediate - This method tests to see if the node is a specific
2424 // opcode and that it has an immediate integer right operand.
2425 // If so, Imm will receive the value.
2426static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2427 uint64_t &Imm) {
2428 return N->getOpcode() == Opc &&
2429 isIntImmediate(N->getOperand(1).getNode(), Imm);
2430}
2431
2432static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2433 const APInt &Demanded,
2435 unsigned NewOpc) {
2436 uint64_t OldImm = Imm, NewImm, Enc;
2437 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2438
2439 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2440 // bimm64.
2441 if (Imm == 0 || Imm == Mask ||
2443 return false;
2444
2445 unsigned EltSize = Size;
2446 uint64_t DemandedBits = Demanded.getZExtValue();
2447
2448 // Clear bits that are not demanded.
2449 Imm &= DemandedBits;
2450
2451 while (true) {
2452 // The goal here is to set the non-demanded bits in a way that minimizes
2453 // the number of switching between 0 and 1. In order to achieve this goal,
2454 // we set the non-demanded bits to the value of the preceding demanded bits.
2455 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2456 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2457 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2458 // The final result is 0b11000011.
2459 uint64_t NonDemandedBits = ~DemandedBits;
2460 uint64_t InvertedImm = ~Imm & DemandedBits;
2461 uint64_t RotatedImm =
2462 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2463 NonDemandedBits;
2464 uint64_t Sum = RotatedImm + NonDemandedBits;
2465 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2466 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2467 NewImm = (Imm | Ones) & Mask;
2468
2469 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2470 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2471 // we halve the element size and continue the search.
2472 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2473 break;
2474
2475 // We cannot shrink the element size any further if it is 2-bits.
2476 if (EltSize == 2)
2477 return false;
2478
2479 EltSize /= 2;
2480 Mask >>= EltSize;
2481 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2482
2484 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2484 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2485 return false;
2486
2487 // Merge the upper and lower halves of Imm and DemandedBits.
2488 Imm |= Hi;
2489 DemandedBits |= DemandedBitsHi;
2490 }
2491
2492 ++NumOptimizedImms;
2493
2494 // Replicate the element across the register width.
2495 while (EltSize < Size) {
2496 NewImm |= NewImm << EltSize;
2497 EltSize *= 2;
2498 }
2499
2500 (void)OldImm;
2501 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2502 "demanded bits should never be altered");
2503 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2504
2505 // Create the new constant immediate node.
2506 EVT VT = Op.getValueType();
2507 SDLoc DL(Op);
2508 SDValue New;
2509
2510 // If the new constant immediate is all-zeros or all-ones, let the target
2511 // independent DAG combine optimize this node.
2512 if (NewImm == 0 || NewImm == OrigMask) {
2513 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2514 TLO.DAG.getConstant(NewImm, DL, VT));
2515 // Otherwise, create a machine node so that target independent DAG combine
2516 // doesn't undo this optimization.
2517 } else {
2519 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2520 New = SDValue(
2521 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2522 }
2523
2524 return TLO.CombineTo(Op, New);
2525}
2526
2528 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2529 TargetLoweringOpt &TLO) const {
2530 // Delay this optimization to as late as possible.
2531 if (!TLO.LegalOps)
2532 return false;
2533
2535 return false;
2536
2537 EVT VT = Op.getValueType();
2538 if (VT.isVector())
2539 return false;
2540
2541 unsigned Size = VT.getSizeInBits();
2542
2543 if (Size != 32 && Size != 64)
2544 return false;
2545
2546 // Exit early if we demand all bits.
2547 if (DemandedBits.popcount() == Size)
2548 return false;
2549
2550 unsigned NewOpc;
2551 switch (Op.getOpcode()) {
2552 default:
2553 return false;
2554 case ISD::AND:
2555 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2556 break;
2557 case ISD::OR:
2558 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2559 break;
2560 case ISD::XOR:
2561 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2562 break;
2563 }
2564 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2565 if (!C)
2566 return false;
2567 uint64_t Imm = C->getZExtValue();
2568 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2569}
2570
2571/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2572 /// Mask are known to be either zero or one and return them in Known.
2574 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2575 const SelectionDAG &DAG, unsigned Depth) const {
2576 switch (Op.getOpcode()) {
2577 default:
2578 break;
2579 case AArch64ISD::DUP: {
2580 SDValue SrcOp = Op.getOperand(0);
2581 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2582 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2583 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2584 "Expected DUP implicit truncation");
2585 Known = Known.trunc(Op.getScalarValueSizeInBits());
2586 }
2587 break;
2588 }
2589 case AArch64ISD::CSEL: {
2590 KnownBits Known2;
2591 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2592 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2593 Known = Known.intersectWith(Known2);
2594 break;
2595 }
2596 case AArch64ISD::CSNEG:
2597 case AArch64ISD::CSINC:
2598 case AArch64ISD::CSINV: {
2599 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2600 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2601
2602 // The result is either:
2603 // CSINC: KnownOp0 or KnownOp1 + 1
2604 // CSINV: KnownOp0 or ~KnownOp1
2605 // CSNEG: KnownOp0 or KnownOp1 * -1
2606 if (Op.getOpcode() == AArch64ISD::CSINC)
2607 KnownOp1 = KnownBits::add(
2608 KnownOp1,
2609 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2610 else if (Op.getOpcode() == AArch64ISD::CSINV)
2611 std::swap(KnownOp1.Zero, KnownOp1.One);
2612 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2613 KnownOp1 =
2615 Op.getScalarValueSizeInBits())));
2616
2617 Known = KnownOp0.intersectWith(KnownOp1);
2618 break;
2619 }
2620 case AArch64ISD::BICi: {
2621 // Compute the bit cleared value.
2622 APInt Mask =
2623 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2624 .trunc(Known.getBitWidth());
2625 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2626 Known &= KnownBits::makeConstant(Mask);
2627 break;
2628 }
2629 case AArch64ISD::VLSHR: {
2630 KnownBits Known2;
2631 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2632 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2633 Known = KnownBits::lshr(Known, Known2);
2634 break;
2635 }
2636 case AArch64ISD::VASHR: {
2637 KnownBits Known2;
2638 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2639 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2640 Known = KnownBits::ashr(Known, Known2);
2641 break;
2642 }
2643 case AArch64ISD::VSHL: {
2644 KnownBits Known2;
2645 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2646 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2647 Known = KnownBits::shl(Known, Known2);
2648 break;
2649 }
2650 case AArch64ISD::MOVI: {
2652 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2653 break;
2654 }
2655 case AArch64ISD::MOVIshift: {
2657 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2658 << Op->getConstantOperandVal(1)));
2659 break;
2660 }
2661 case AArch64ISD::MOVImsl: {
2662 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2664 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2665 break;
2666 }
2667 case AArch64ISD::MOVIedit: {
2669 Known.getBitWidth(),
2670 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2671 break;
2672 }
2673 case AArch64ISD::MVNIshift: {
2675 APInt(Known.getBitWidth(),
2676 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2677 /*isSigned*/ false, /*implicitTrunc*/ true));
2678 break;
2679 }
2680 case AArch64ISD::MVNImsl: {
2681 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2683 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2684 /*isSigned*/ false, /*implicitTrunc*/ true));
2685 break;
2686 }
2687 case AArch64ISD::LOADgot:
2688 case AArch64ISD::ADDlow: {
2689 if (!Subtarget->isTargetILP32())
2690 break;
2691 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2692 Known.Zero = APInt::getHighBitsSet(64, 32);
2693 break;
2694 }
2695 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2696 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2697 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2698 break;
2699 }
2701 Intrinsic::ID IntID =
2702 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2703 switch (IntID) {
2704 default: return;
2705 case Intrinsic::aarch64_ldaxr:
2706 case Intrinsic::aarch64_ldxr: {
2707 unsigned BitWidth = Known.getBitWidth();
2708 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2709 unsigned MemBits = VT.getScalarSizeInBits();
2710 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2711 return;
2712 }
2713 }
2714 break;
2715 }
2717 case ISD::INTRINSIC_VOID: {
2718 unsigned IntNo = Op.getConstantOperandVal(0);
2719 switch (IntNo) {
2720 default:
2721 break;
2722 case Intrinsic::aarch64_neon_uaddlv: {
2723 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2724 unsigned BitWidth = Known.getBitWidth();
2725 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2726 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2727 assert(BitWidth >= Bound && "Unexpected width!");
2729 Known.Zero |= Mask;
2730 }
2731 break;
2732 }
2733 case Intrinsic::aarch64_neon_umaxv:
2734 case Intrinsic::aarch64_neon_uminv: {
2735 // Figure out the datatype of the vector operand. The UMINV instruction
2736 // will zero extend the result, so we can mark as known zero all the
2737 // bits larger than the element datatype. 32-bit or larger doesn't need
2738 // this as those are legal types and will be handled by isel directly.
2739 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2740 unsigned BitWidth = Known.getBitWidth();
2741 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2742 assert(BitWidth >= 8 && "Unexpected width!");
2744 Known.Zero |= Mask;
2745 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2746 assert(BitWidth >= 16 && "Unexpected width!");
2748 Known.Zero |= Mask;
2749 }
2750 break;
2751 } break;
2752 }
2753 }
2754 }
2755}
2756
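// Sign-bit counting for AArch64-specific nodes: vector FP compares produce
// all-zeros or all-ones lanes (all bits are sign bits), and VASHR adds its
// immediate shift amount to the sign-bit count of its input.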
2758 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2759 unsigned Depth) const {
2760 EVT VT = Op.getValueType();
2761 unsigned VTBits = VT.getScalarSizeInBits();
2762 unsigned Opcode = Op.getOpcode();
2763 switch (Opcode) {
2764 case AArch64ISD::FCMEQ:
2765 case AArch64ISD::FCMGE:
2766 case AArch64ISD::FCMGT:
2767 // Compares return either 0 or all-ones
2768 return VTBits;
2769 case AArch64ISD::VASHR: {
2770 unsigned Tmp =
2771 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2772 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2773 }
2774 }
2775
2776 return 1;
2777}
2778
2780 EVT) const {
2781 return MVT::i64;
2782}
2783
2785 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2786 unsigned *Fast) const {
2787
2788 // Allow SVE loads/stores where the alignment >= the size of the element type,
2789 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2790 // for stores that come from IR, only require element-size alignment (even if
2791 // unaligned accesses are disabled). Without this, these will be forced to
2792 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2793 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2794 if (VT.isScalableVector()) {
2795 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2796 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2797 return true;
2798 }
2799
2800 if (Subtarget->requiresStrictAlign())
2801 return false;
2802
2803 if (Fast) {
2804 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2805 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2806 // See comments in performSTORECombine() for more details about
2807 // these conditions.
2808
2809 // Code that uses clang vector extensions can mark that it
2810 // wants unaligned accesses to be treated as fast by
2811 // underspecifying alignment to be 1 or 2.
2812 Alignment <= 2 ||
2813
2814 // Disregard v2i64. Memcpy lowering produces those and splitting
2815 // them regresses performance on micro-benchmarks and olden/bh.
2816 VT == MVT::v2i64;
2817 }
2818 return true;
2819}
2820
2821// Same as above but handling LLTs instead.
2823 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2824 unsigned *Fast) const {
2825 if (Subtarget->requiresStrictAlign())
2826 return false;
2827
2828 if (Fast) {
2829 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2830 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2831 Ty.getSizeInBytes() != 16 ||
2832 // See comments in performSTORECombine() for more details about
2833 // these conditions.
2834
2835 // Code that uses clang vector extensions can mark that it
2836 // wants unaligned accesses to be treated as fast by
2837 // underspecifying alignment to be 1 or 2.
2838 Alignment <= 2 ||
2839
2840 // Disregard v2i64. Memcpy lowering produces those and splitting
2841 // them regresses performance on micro-benchmarks and olden/bh.
2842 Ty == LLT::fixed_vector(2, 64);
2843 }
2844 return true;
2845}
2846
2847FastISel *
2849 const TargetLibraryInfo *libInfo) const {
2850 return AArch64::createFastISel(funcInfo, libInfo);
2851}
2852
2855 MachineBasicBlock *MBB) const {
2856 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2857 // phi node:
2858
2859 // OrigBB:
2860 // [... previous instrs leading to comparison ...]
2861 // b.ne TrueBB
2862 // b EndBB
2863 // TrueBB:
2864 // ; Fallthrough
2865 // EndBB:
2866 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2867
2868 MachineFunction *MF = MBB->getParent();
2869 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2870 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2871 DebugLoc DL = MI.getDebugLoc();
2872 MachineFunction::iterator It = ++MBB->getIterator();
2873
2874 Register DestReg = MI.getOperand(0).getReg();
2875 Register IfTrueReg = MI.getOperand(1).getReg();
2876 Register IfFalseReg = MI.getOperand(2).getReg();
2877 unsigned CondCode = MI.getOperand(3).getImm();
2878 bool NZCVKilled = MI.getOperand(4).isKill();
2879
2880 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2881 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2882 MF->insert(It, TrueBB);
2883 MF->insert(It, EndBB);
2884
2885 // Transfer rest of current basic-block to EndBB
2886 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2887 MBB->end());
2889
2890 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2891 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2892 MBB->addSuccessor(TrueBB);
2893 MBB->addSuccessor(EndBB);
2894
2895 // TrueBB falls through to the end.
2896 TrueBB->addSuccessor(EndBB);
2897
2898 if (!NZCVKilled) {
2899 TrueBB->addLiveIn(AArch64::NZCV);
2900 EndBB->addLiveIn(AArch64::NZCV);
2901 }
2902
2903 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2904 .addReg(IfTrueReg)
2905 .addMBB(TrueBB)
2906 .addReg(IfFalseReg)
2907 .addMBB(MBB);
2908
2909 MI.eraseFromParent();
2910 return EndBB;
2911}
2912
2920
2923 MachineBasicBlock *MBB) const {
2924 MachineFunction &MF = *MBB->getParent();
2925 MachineBasicBlock::iterator MBBI = MI.getIterator();
2926 const AArch64InstrInfo &TII =
2927 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2928 Register TargetReg = MI.getOperand(0).getReg();
2930 TII.probedStackAlloc(MBBI, TargetReg, false);
2931
2932 MI.eraseFromParent();
2933 return NextInst->getParent();
2934}
2935
2938 MachineBasicBlock *MBB) const {
2939 MachineFunction *MF = MBB->getParent();
2941
2942 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2943 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2944
2945 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2946 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2947 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2948 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2949
2950 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2951 DebugLoc DL = MI.getDebugLoc();
2952
2953 // RDVL requires GPR64, ADDSVL requires GPR64sp
2954 // We need to insert COPY instructions; these will later be removed by the
2955 // RegisterCoalescer.
2956 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2957 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2958 .addReg(RegVL_GPR);
2959
2960 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2961 .addReg(RegVL_GPRsp)
2962 .addImm(-1);
2963 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2964 .addReg(RegSVL_GPRsp);
2965
2966 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2967 MachineFunction::iterator It = ++MBB->getIterator();
2968 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2969 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2970 MF->insert(It, TrapBB);
2971 MF->insert(It, PassBB);
2972
2973 // Continue if vector lengths match
2974 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2975 .addReg(RegSVL_GPR)
2976 .addMBB(PassBB);
2977
2978 // Transfer rest of current BB to PassBB
2979 PassBB->splice(PassBB->begin(), MBB,
2980 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2982
2983 // Trap if vector lengths mismatch
2984 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2985
2986 MBB->addSuccessor(TrapBB);
2987 MBB->addSuccessor(PassBB);
2988
2989 MI.eraseFromParent();
2990 return PassBB;
2991}
2992
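// Lower an SME tile-load pseudo: select the physical ZA tile from the
// immediate operand and forward the slice index, predicate, base and offset.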
2994AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2996 MachineBasicBlock *BB) const {
2997 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2998 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2999
3000 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3001 MIB.add(MI.getOperand(1)); // slice index register
3002 MIB.add(MI.getOperand(2)); // slice index offset
3003 MIB.add(MI.getOperand(3)); // pg
3004 MIB.add(MI.getOperand(4)); // base
3005 MIB.add(MI.getOperand(5)); // offset
3006
3007 MI.eraseFromParent(); // The pseudo is gone now.
3008 return BB;
3009}
3010
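// Lower LDR_ZA_PSEUDO into LDR_ZA, defining ZA and reusing the vector-select
// offset as the immediate offset of the fill.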
3013 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3015 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3016
3017 MIB.addReg(AArch64::ZA, RegState::Define);
3018 MIB.add(MI.getOperand(0)); // Vector select register
3019 MIB.add(MI.getOperand(1)); // Vector select offset
3020 MIB.add(MI.getOperand(2)); // Base
3021 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3022
3023 MI.eraseFromParent(); // The pseudo is gone now.
3024 return BB;
3025}
3026
3029 unsigned Opcode,
3030 bool Op0IsDef) const {
3031 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3033
3034 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3035 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3036 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3037 MIB.add(MI.getOperand(I));
3038
3039 MI.eraseFromParent(); // The pseudo is gone now.
3040 return BB;
3041}
3042
3044AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3046 MachineBasicBlock *BB) const {
3047 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3048 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3049 unsigned StartIdx = 0;
3050
3051 bool HasTile = BaseReg != AArch64::ZA;
3052 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3053 if (HasZPROut) {
3054 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3055 ++StartIdx;
3056 }
3057 if (HasTile) {
3058 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3059 RegState::Define); // Output ZA Tile
3060 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input ZA Tile
3061 StartIdx++;
3062 } else {
3063 // Skip instructions of the form za.<sz>[Reg, Imm, ...] (no separate ZPR result); otherwise add the output ZPR first.
3064 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3065 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3066 ++StartIdx;
3067 }
3068 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3069 }
3070 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3071 MIB.add(MI.getOperand(I));
3072
3073 MI.eraseFromParent(); // The pseudo is gone now.
3074 return BB;
3075}
3076
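// Lower ZERO_M_PSEUDO: emit ZERO_M and mark every ZA double-word tile
// selected by the mask as implicitly defined.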
3079 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3081 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3082 MIB.add(MI.getOperand(0)); // Mask
3083
3084 unsigned Mask = MI.getOperand(0).getImm();
3085 for (unsigned I = 0; I < 8; I++) {
3086 if (Mask & (1 << I))
3087 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3088 }
3089
3090 MI.eraseFromParent(); // The pseudo is gone now.
3091 return BB;
3092}
3093
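// Lower InitTPIDR2Obj: if the TPIDR2 block is used, store the lazy-save
// buffer pointer and the number of ZA save slices into it; otherwise drop
// the now-unneeded stack object.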
3096 MachineBasicBlock *BB) const {
3097 MachineFunction *MF = BB->getParent();
3098 MachineFrameInfo &MFI = MF->getFrameInfo();
3100 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3101 if (TPIDR2.Uses > 0) {
3102 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3103 // generally don't support big-endian SVE/SME.
3104 if (!Subtarget->isLittleEndian())
3106 "TPIDR2 block initialization is not supported on big-endian targets");
3107
3108 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3109 // Store buffer pointer and num_za_save_slices.
3110 // Bytes 10-15 are implicitly zeroed.
3111 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3112 .addReg(MI.getOperand(0).getReg())
3113 .addReg(MI.getOperand(1).getReg())
3114 .addFrameIndex(TPIDR2.FrameIndex)
3115 .addImm(0);
3116 } else
3117 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3118
3119 BB->remove_instr(&MI);
3120 return BB;
3121}
3122
3125 MachineBasicBlock *BB) const {
3126 MachineFunction *MF = BB->getParent();
3127 MachineFrameInfo &MFI = MF->getFrameInfo();
3129 // TODO This function grows the stack with a subtraction, which doesn't work
3130 // on Windows. Some refactoring to share the functionality in
3131 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3132 // supports SME
3134 "Lazy ZA save is not yet supported on Windows");
3135
3136 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3137
3138 if (TPIDR2.Uses > 0) {
3139 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3141
3142 // The SUBXrs below won't always be emitted in a form that accepts SP
3143 // directly
3144 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3145 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3146 .addReg(AArch64::SP);
3147
3148 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3149 auto Size = MI.getOperand(1).getReg();
3150 auto Dest = MI.getOperand(0).getReg();
3151 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3152 .addReg(Size)
3153 .addReg(Size)
3154 .addReg(SP);
3155 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3156 AArch64::SP)
3157 .addReg(Dest);
3158
3159 // We have just allocated a variable sized object, tell this to PEI.
3160 MFI.CreateVariableSizedObject(Align(16), nullptr);
3161 }
3162
3163 BB->remove_instr(&MI);
3164 return BB;
3165}
3166
3167// TODO: Find a way to merge this with EmitAllocateZABuffer.
3170 MachineBasicBlock *BB) const {
3171 MachineFunction *MF = BB->getParent();
3172 MachineFrameInfo &MFI = MF->getFrameInfo();
3175 "Lazy ZA save is not yet supported on Windows");
3176
3177 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3178 if (FuncInfo->isSMESaveBufferUsed()) {
3179 // Allocate a buffer object of the size given by MI.getOperand(1).
3180 auto Size = MI.getOperand(1).getReg();
3181 auto Dest = MI.getOperand(0).getReg();
3182 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3183 .addReg(AArch64::SP)
3184 .addReg(Size)
3186 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3187 .addReg(AArch64::SP);
3188
3189 // We have just allocated a variable sized object, tell this to PEI.
3190 MFI.CreateVariableSizedObject(Align(16), nullptr);
3191 } else
3192 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3193 MI.getOperand(0).getReg());
3194
3195 BB->remove_instr(&MI);
3196 return BB;
3197}
3198
3201 MachineBasicBlock *BB) const {
3202 // If the buffer is used, emit a call to __arm_sme_state_size()
3203 MachineFunction *MF = BB->getParent();
3205 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3206 if (FuncInfo->isSMESaveBufferUsed()) {
3207 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3208 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3209 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3211 .addReg(AArch64::X0, RegState::ImplicitDefine)
3212 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3213 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3214 MI.getOperand(0).getReg())
3215 .addReg(AArch64::X0);
3216 } else
3217 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3218 MI.getOperand(0).getReg())
3219 .addReg(AArch64::XZR);
3220 BB->remove_instr(&MI);
3221 return BB;
3222}
3223
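// Lower EntryPStateSM: materialise the streaming-mode state at function
// entry, either by reading SVCR via MRS when SME is available or by calling
// the SME ABI state routine and copying the result out of X0.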
3226 MachineBasicBlock *BB) const {
3227 MachineFunction *MF = BB->getParent();
3228 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3229 const DebugLoc &DL = MI.getDebugLoc();
3230 Register ResultReg = MI.getOperand(0).getReg();
3231 if (MF->getRegInfo().use_empty(ResultReg)) {
3232 // Nothing to do. Pseudo erased below.
3233 } else if (Subtarget->hasSME()) {
3234 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3235 .addImm(AArch64SysReg::SVCR)
3236 .addReg(AArch64::VG, RegState::Implicit);
3237 } else {
3238 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3239 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3240 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3242 .addReg(AArch64::X0, RegState::ImplicitDefine)
3243 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3244 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3245 .addReg(AArch64::X0);
3246 }
3247 MI.eraseFromParent();
3248 return BB;
3249}
3250
3251// Helper function to find the instruction that defined a virtual register.
3252// If unable to find such instruction, returns nullptr.
3254 Register Reg) {
3255 while (Reg.isVirtual()) {
3256 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3257 assert(DefMI && "Virtual register definition not found");
3258 unsigned Opcode = DefMI->getOpcode();
3259
3260 if (Opcode == AArch64::COPY) {
3261 Reg = DefMI->getOperand(1).getReg();
3262 // Vreg is defined by copying from physreg.
3263 if (Reg.isPhysical())
3264 return DefMI;
3265 continue;
3266 }
3267 if (Opcode == AArch64::SUBREG_TO_REG) {
3268 Reg = DefMI->getOperand(2).getReg();
3269 continue;
3270 }
3271
3272 return DefMI;
3273 }
3274 return nullptr;
3275}
3276
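// Fold a discriminator blend feeding MI's address-discriminator operand
// (a MOVK ..., #imm, #48 or a small MOV immediate) into the integer
// discriminator, and normalise XZR to NoRegister and the register class.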
3279 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3280 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3281 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3282 const DebugLoc &DL = MI.getDebugLoc();
3283
3284 Register AddrDisc = AddrDiscOp.getReg();
3285 int64_t IntDisc = IntDiscOp.getImm();
3286 assert(IntDisc == 0 && "Blend components are already expanded");
3287
3288 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3289 if (DiscMI) {
3290 switch (DiscMI->getOpcode()) {
3291 case AArch64::MOVKXi:
3292 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3293 // #imm should be an immediate and not a global symbol, for example.
3294 if (DiscMI->getOperand(2).isImm() &&
3295 DiscMI->getOperand(3).getImm() == 48) {
3296 AddrDisc = DiscMI->getOperand(1).getReg();
3297 IntDisc = DiscMI->getOperand(2).getImm();
3298 }
3299 break;
3300 case AArch64::MOVi32imm:
3301 case AArch64::MOVi64imm:
3302 // Small immediate integer constant passed via VReg.
3303 if (DiscMI->getOperand(1).isImm() &&
3304 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3305 AddrDisc = AArch64::NoRegister;
3306 IntDisc = DiscMI->getOperand(1).getImm();
3307 }
3308 break;
3309 }
3310 }
3311
3312 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3313 // in the requested register class.
3314 if (AddrDisc == AArch64::XZR)
3315 AddrDisc = AArch64::NoRegister;
3316
3317 // Make sure AddrDisc operand respects the register class imposed by MI.
3318 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3319 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3320 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3321 AddrDisc = TmpReg;
3322 }
3323
3324 AddrDiscOp.setReg(AddrDisc);
3325 IntDiscOp.setImm(IntDisc);
3326}
3327
3329 MachineInstr &MI, MachineBasicBlock *BB) const {
3330
3331 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3332 if (SMEOrigInstr != -1) {
3333 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3334 uint64_t SMEMatrixType =
3335 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3336 switch (SMEMatrixType) {
3338 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3340 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3342 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3344 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3346 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3348 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3349 }
3350 }
3351
3352 switch (MI.getOpcode()) {
3353 default:
3354#ifndef NDEBUG
3355 MI.dump();
3356#endif
3357 llvm_unreachable("Unexpected instruction for custom inserter!");
3358 case AArch64::InitTPIDR2Obj:
3359 return EmitInitTPIDR2Object(MI, BB);
3360 case AArch64::AllocateZABuffer:
3361 return EmitAllocateZABuffer(MI, BB);
3362 case AArch64::AllocateSMESaveBuffer:
3363 return EmitAllocateSMESaveBuffer(MI, BB);
3364 case AArch64::GetSMESaveSize:
3365 return EmitGetSMESaveSize(MI, BB);
3366 case AArch64::EntryPStateSM:
3367 return EmitEntryPStateSM(MI, BB);
3368 case AArch64::F128CSEL:
3369 return EmitF128CSEL(MI, BB);
3370 case TargetOpcode::STATEPOINT:
3371 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3372 // while the BL call instruction (to which the statepoint is lowered at the end)
3373 // has an implicit def. This def is early-clobber as it will be set at
3374 // the moment of the call and earlier than any use is read.
3375 // Add this implicit dead def here as a workaround.
3376 MI.addOperand(*MI.getMF(),
3378 AArch64::LR, /*isDef*/ true,
3379 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3380 /*isUndef*/ false, /*isEarlyClobber*/ true));
3381 [[fallthrough]];
3382 case TargetOpcode::STACKMAP:
3383 case TargetOpcode::PATCHPOINT:
3384 return emitPatchPoint(MI, BB);
3385
3386 case TargetOpcode::PATCHABLE_EVENT_CALL:
3387 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3388 return BB;
3389
3390 case AArch64::CATCHRET:
3391 return EmitLoweredCatchRet(MI, BB);
3392
3393 case AArch64::PROBED_STACKALLOC_DYN:
3394 return EmitDynamicProbedAlloc(MI, BB);
3395
3396 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3397 return EmitCheckMatchingVL(MI, BB);
3398
3399 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3400 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3401 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3402 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3403 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3404 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3405 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3406 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3407 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3408 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3409 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3410 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3411 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3412 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3413 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3414 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3415 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3416 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3417 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3418 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3419 case AArch64::LDR_ZA_PSEUDO:
3420 return EmitFill(MI, BB);
3421 case AArch64::LDR_TX_PSEUDO:
3422 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3423 case AArch64::STR_TX_PSEUDO:
3424 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3425 case AArch64::ZERO_M_PSEUDO:
3426 return EmitZero(MI, BB);
3427 case AArch64::ZERO_T_PSEUDO:
3428 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3429 case AArch64::MOVT_TIZ_PSEUDO:
3430 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3431
3432 case AArch64::PAC:
3433 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3434 &AArch64::GPR64noipRegClass);
3435 return BB;
3436 }
3437}
3438
3439//===----------------------------------------------------------------------===//
3440// AArch64 Lowering private implementation.
3441//===----------------------------------------------------------------------===//
3442
3443//===----------------------------------------------------------------------===//
3444// Lowering Code
3445//===----------------------------------------------------------------------===//
3446
3447// Forward declarations of SVE fixed length lowering helpers
3452 SelectionDAG &DAG);
3455 EVT VT);
3456
3457/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3458static bool isZerosVector(const SDNode *N) {
3459 // Look through a bit convert.
3460 while (N->getOpcode() == ISD::BITCAST)
3461 N = N->getOperand(0).getNode();
3462
3464 return true;
3465
3466 if (N->getOpcode() != AArch64ISD::DUP)
3467 return false;
3468
3469 auto Opnd0 = N->getOperand(0);
3470 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3471}
3472
3473/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3474/// CC
3476 SDValue RHS = {}) {
3477 switch (CC) {
3478 default:
3479 llvm_unreachable("Unknown condition code!");
3480 case ISD::SETNE:
3481 return AArch64CC::NE;
3482 case ISD::SETEQ:
3483 return AArch64CC::EQ;
3484 case ISD::SETGT:
3485 return AArch64CC::GT;
3486 case ISD::SETGE:
3488 case ISD::SETLT:
3490 case ISD::SETLE:
3491 return AArch64CC::LE;
3492 case ISD::SETUGT:
3493 return AArch64CC::HI;
3494 case ISD::SETUGE:
3495 return AArch64CC::HS;
3496 case ISD::SETULT:
3497 return AArch64CC::LO;
3498 case ISD::SETULE:
3499 return AArch64CC::LS;
3500 }
3501}
3502
3503/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3505 AArch64CC::CondCode &CondCode,
3506 AArch64CC::CondCode &CondCode2) {
3507 CondCode2 = AArch64CC::AL;
3508 switch (CC) {
3509 default:
3510 llvm_unreachable("Unknown FP condition!");
3511 case ISD::SETEQ:
3512 case ISD::SETOEQ:
3513 CondCode = AArch64CC::EQ;
3514 break;
3515 case ISD::SETGT:
3516 case ISD::SETOGT:
3517 CondCode = AArch64CC::GT;
3518 break;
3519 case ISD::SETGE:
3520 case ISD::SETOGE:
3521 CondCode = AArch64CC::GE;
3522 break;
3523 case ISD::SETOLT:
3524 CondCode = AArch64CC::MI;
3525 break;
3526 case ISD::SETOLE:
3527 CondCode = AArch64CC::LS;
3528 break;
3529 case ISD::SETONE:
3530 CondCode = AArch64CC::MI;
3531 CondCode2 = AArch64CC::GT;
3532 break;
3533 case ISD::SETO:
3534 CondCode = AArch64CC::VC;
3535 break;
3536 case ISD::SETUO:
3537 CondCode = AArch64CC::VS;
3538 break;
3539 case ISD::SETUEQ:
3540 CondCode = AArch64CC::EQ;
3541 CondCode2 = AArch64CC::VS;
3542 break;
3543 case ISD::SETUGT:
3544 CondCode = AArch64CC::HI;
3545 break;
3546 case ISD::SETUGE:
3547 CondCode = AArch64CC::PL;
3548 break;
3549 case ISD::SETLT:
3550 case ISD::SETULT:
3551 CondCode = AArch64CC::LT;
3552 break;
3553 case ISD::SETLE:
3554 case ISD::SETULE:
3555 CondCode = AArch64CC::LE;
3556 break;
3557 case ISD::SETNE:
3558 case ISD::SETUNE:
3559 CondCode = AArch64CC::NE;
3560 break;
3561 }
3562}
3563
3564/// Convert a DAG fp condition code to an AArch64 CC.
3565/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3566/// should be AND'ed instead of OR'ed.
3568 AArch64CC::CondCode &CondCode,
3569 AArch64CC::CondCode &CondCode2) {
3570 CondCode2 = AArch64CC::AL;
3571 switch (CC) {
3572 default:
3573 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3574 assert(CondCode2 == AArch64CC::AL);
3575 break;
3576 case ISD::SETONE:
3577 // (a one b)
3578 // == ((a olt b) || (a ogt b))
3579 // == ((a ord b) && (a une b))
3580 CondCode = AArch64CC::VC;
3581 CondCode2 = AArch64CC::NE;
3582 break;
3583 case ISD::SETUEQ:
3584 // (a ueq b)
3585 // == ((a uno b) || (a oeq b))
3586 // == ((a ule b) && (a uge b))
3587 CondCode = AArch64CC::PL;
3588 CondCode2 = AArch64CC::LE;
3589 break;
3590 }
3591}
3592
3593/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3594/// CC usable with the vector instructions. Fewer operations are available
3595/// without a real NZCV register, so we have to use less efficient combinations
3596/// to get the same effect.
3598 AArch64CC::CondCode &CondCode,
3599 AArch64CC::CondCode &CondCode2,
3600 bool &Invert) {
3601 Invert = false;
3602 switch (CC) {
3603 default:
3604 // Mostly the scalar mappings work fine.
3605 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3606 break;
3607 case ISD::SETUO:
3608 Invert = true;
3609 [[fallthrough]];
3610 case ISD::SETO:
3611 CondCode = AArch64CC::MI;
3612 CondCode2 = AArch64CC::GE;
3613 break;
3614 case ISD::SETUEQ:
3615 case ISD::SETULT:
3616 case ISD::SETULE:
3617 case ISD::SETUGT:
3618 case ISD::SETUGE:
3619 // All of the compare-mask comparisons are ordered, but we can switch
3620 // between the two by a double inversion. E.g. ULE == !OGT.
3621 Invert = true;
3622 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3623 CondCode, CondCode2);
3624 break;
3625 }
3626}
3627
3628/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3630 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3631 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3632}
3633
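// An arithmetic immediate is an unsigned 12-bit value, optionally shifted
// left by 12 (e.g. 4095 and 0x123000 are legal, 0x1001 is not).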
3635 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3636 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3637 LLVM_DEBUG(dbgs() << "Is imm " << C
3638 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3639 return IsLegal;
3640}
3641
3643 // Works for negative immediates too, as it can be written as an ADDS
3644 // instruction with a negated immediate.
3645 return isLegalArithImmed(C.abs().getZExtValue());
3646}
3647
3649 uint64_t Imm = C.getZExtValue();
3651 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3652 return Insn.size();
3653}
3654
3656 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3657 if (Op->getFlags().hasNoSignedWrap())
3658 return true;
3659
3660 // We can still figure out if the second operand is safe to use
3661 // in a CMN instruction by checking if it is known to be not the minimum
3662 // signed value. If it is not, then we can safely use CMN.
3663 // Note: We can eventually remove this check and simply rely on
3664 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3665 // consistently sets them appropriately when making said nodes.
3666
3667 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3668 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3669}
3670
3671 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3672// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3673// can be set differently by this operation. It comes down to whether
3674// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3675// everything is fine. If not then the optimization is wrong. Thus general
3676// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3677//
3678// So, finally, the only LLVM-native comparisons that don't mention C or V
3679// are the ones that aren't unsigned comparisons. They're the only ones we can
3680// safely use CMN for in the absence of information about op2.
3682 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3683 (isIntEqualitySetCC(CC) ||
3684 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3685 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3686}
3687
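// Emit a chained strict FP comparison (STRICT_FCMP, or STRICT_FCMPE for the
// signaling form), promoting f16 (without FullFP16) and bf16 operands to f32.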
3689 SelectionDAG &DAG, SDValue Chain,
3690 bool IsSignaling) {
3691 EVT VT = LHS.getValueType();
3692 assert(VT != MVT::f128);
3693
3694 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3695
3696 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3697 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3698 {Chain, LHS});
3699 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3700 {LHS.getValue(1), RHS});
3701 Chain = RHS.getValue(1);
3702 }
3703 unsigned Opcode =
3704 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3705 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3706}
3707
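// Emit the flag-producing node for a comparison: FCMP for floating point,
// otherwise SUBS, folded into ADDS (CMN) or reusing an existing ANDS (TST)
// when the operands and condition code allow it.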
3709 const SDLoc &DL, SelectionDAG &DAG) {
3710 EVT VT = LHS.getValueType();
3711 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3712
3713 if (VT.isFloatingPoint()) {
3714 assert(VT != MVT::f128);
3715 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3716 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3717 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3718 }
3719 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3720 }
3721
3722 // The CMP instruction is just an alias for SUBS, and representing it as
3723 // SUBS means that it's possible to get CSE with subtract operations.
3724 // A later phase can perform the optimization of setting the destination
3725 // register to WZR/XZR if it ends up being unused.
3726 unsigned Opcode = AArch64ISD::SUBS;
3727
3728 if (isCMN(RHS, CC, DAG)) {
3729    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3730 Opcode = AArch64ISD::ADDS;
3731 RHS = RHS.getOperand(1);
3732 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3733 isIntEqualitySetCC(CC)) {
3734    // As we are looking for EQ/NE compares, the operands can be commuted; can
3735    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3736 Opcode = AArch64ISD::ADDS;
3737 LHS = LHS.getOperand(1);
3738 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3739 if (LHS.getOpcode() == ISD::AND) {
3740 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3741 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3742 // of the signed comparisons.
3743 const SDValue ANDSNode =
3744 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3745 LHS.getOperand(0), LHS.getOperand(1));
3746 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3747 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3748 return ANDSNode.getValue(1);
3749 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3750 // Use result of ANDS
3751 return LHS.getValue(1);
3752 }
3753 }
3754
3755 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3756 .getValue(1);
3757}
3758
3759/// \defgroup AArch64CCMP CMP;CCMP matching
3760///
3761/// These functions deal with the formation of CMP;CCMP;... sequences.
3762/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3763/// a comparison. They set the NZCV flags to a predefined value if their
3764/// predicate is false. This allows expressing arbitrary conjunctions; for
3765/// example, "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" can be
3766/// expressed as:
3767/// cmp A
3768/// ccmp B, inv(CB), CA
3769/// check for CB flags
3770///
3771/// This naturally lets us implement chains of AND operations with SETCC
3772/// operands. And we can even implement some other situations by transforming
3773/// them:
3774/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3775/// negating the flags used in a CCMP/FCCMP operation.
3776/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3777/// by negating the flags we test for afterwards. i.e.
3778/// NEG (CMP CCMP CCMP ...) can be implemented.
3779/// - Note that we can only ever negate all previously processed results.
3780/// What we cannot implement by flipping the flags to test is a negation
3781/// of two sub-trees (because the negation affects all sub-trees emitted so
3782/// far, so the 2nd sub-tree we emit would also affect the first).
3783/// With those tools we can implement some OR operations:
3784/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3785/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3786/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3787/// elimination rules from earlier to implement the whole thing as a
3788/// CCMP/FCCMP chain.
3789///
3790/// As complete example:
3791/// or (or (setCA (cmp A)) (setCB (cmp B)))
3792/// (and (setCC (cmp C)) (setCD (cmp D)))
3793/// can be reassociated to:
3794/// or (and (setCC (cmp C)) (setCD (cmp D)))
3795///    (or (setCA (cmp A)) (setCB (cmp B)))
3796/// can be transformed to:
3797/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3798///          (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3799/// which can be implemented as:
3800/// cmp C
3801/// ccmp D, inv(CD), CC
3802/// ccmp A, CA, inv(CD)
3803/// ccmp B, CB, inv(CA)
3804/// check for CB flags
3805///
3806/// A counterexample is "or (and A B) (and C D)" which translates to
3807/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3808/// can only implement one of the inner (not) operations, but not both!
3809/// @{
3810
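// Illustrative standalone sketch (not from this file): the boolean identity
// the comment block above relies on when it rewrites an OR of comparisons as
// a negated AND chain, checked over every truth assignment.
#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B)
      assert((A || B) == !(!A && !B)); // the CCMP chain computes the inner AND
  return 0;                            // and inverts the final tested condition
}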
3811/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3812static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3813 ISD::CondCode CC, SDValue CCOp,
3814                                         AArch64CC::CondCode Predicate,
3815 AArch64CC::CondCode OutCC,
3816 const SDLoc &DL, SelectionDAG &DAG) {
3817 unsigned Opcode = 0;
3818 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3819
3820 if (LHS.getValueType().isFloatingPoint()) {
3821 assert(LHS.getValueType() != MVT::f128);
3822 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3823 LHS.getValueType() == MVT::bf16) {
3824 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3825 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3826 }
3827 Opcode = AArch64ISD::FCCMP;
3828 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3829 APInt Imm = Const->getAPIntValue();
3830 if (Imm.isNegative() && Imm.sgt(-32)) {
3831 Opcode = AArch64ISD::CCMN;
3832 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3833 }
3834 } else if (isCMN(RHS, CC, DAG)) {
3835 Opcode = AArch64ISD::CCMN;
3836 RHS = RHS.getOperand(1);
3837 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3838 isIntEqualitySetCC(CC)) {
3839    // As we are looking for EQ/NE compares, the operands can be commuted; can
3840    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3841 Opcode = AArch64ISD::CCMN;
3842 LHS = LHS.getOperand(1);
3843 }
3844 if (Opcode == 0)
3845 Opcode = AArch64ISD::CCMP;
3846
3847 SDValue Condition = getCondCode(DAG, Predicate);
3848  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3849 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3850 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3851 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3852}
3853
3854/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3855/// expressed as a conjunction. See \ref AArch64CCMP.
3856/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3857/// changing the conditions on the SETCC tests.
3858/// (this means we can call emitConjunctionRec() with
3859/// Negate==true on this sub-tree)
3860/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3861/// cannot do the negation naturally. We are required to
3862/// emit the subtree first in this case.
3863/// \param WillNegate Is true if we are called when the result of this
3864/// subexpression must be negated. This happens when the
3865/// outer expression is an OR. We can use this fact to know
3866/// that we have a double negation (or (or ...) ...) that
3867/// can be implemented for free.
3868static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3869 bool &MustBeFirst, bool WillNegate,
3870 unsigned Depth = 0) {
3871 if (!Val.hasOneUse())
3872 return false;
3873 unsigned Opcode = Val->getOpcode();
3874 if (Opcode == ISD::SETCC) {
3875 if (Val->getOperand(0).getValueType() == MVT::f128)
3876 return false;
3877 CanNegate = true;
3878 MustBeFirst = false;
3879 return true;
3880 }
3881 // Protect against exponential runtime and stack overflow.
3882 if (Depth > 6)
3883 return false;
3884 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3885 bool IsOR = Opcode == ISD::OR;
3886 SDValue O0 = Val->getOperand(0);
3887 SDValue O1 = Val->getOperand(1);
3888 bool CanNegateL;
3889 bool MustBeFirstL;
3890 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3891 return false;
3892 bool CanNegateR;
3893 bool MustBeFirstR;
3894 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3895 return false;
3896
3897 if (MustBeFirstL && MustBeFirstR)
3898 return false;
3899
3900 if (IsOR) {
3901 // For an OR expression we need to be able to naturally negate at least
3902 // one side or we cannot do the transformation at all.
3903 if (!CanNegateL && !CanNegateR)
3904 return false;
3905      // If the result of the OR will be negated and we can naturally negate
3906      // the leaves, then this sub-tree as a whole negates naturally.
3907 CanNegate = WillNegate && CanNegateL && CanNegateR;
3908 // If we cannot naturally negate the whole sub-tree, then this must be
3909 // emitted first.
3910 MustBeFirst = !CanNegate;
3911 } else {
3912 assert(Opcode == ISD::AND && "Must be OR or AND");
3913 // We cannot naturally negate an AND operation.
3914 CanNegate = false;
3915 MustBeFirst = MustBeFirstL || MustBeFirstR;
3916 }
3917 return true;
3918 }
3919 return false;
3920}
3921
3922/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
3923/// chain of CCMP/FCCMP ops. See @ref AArch64CCMP.
3924/// Tries to transform the given i1 producing node @p Val to a series of
3925/// compare and conditional compare operations. @returns an NZCV flags
3926/// producing node and sets @p OutCC to the flags that should be tested or
3927/// returns SDValue() if the transformation was not possible.
3928/// \p Negate is true if we want this sub-tree to be negated just by changing
3929/// SETCC conditions.
3930static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3931 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3932    AArch64CC::CondCode Predicate) {
3933 // We're at a tree leaf, produce a conditional comparison operation.
3934 unsigned Opcode = Val->getOpcode();
3935 if (Opcode == ISD::SETCC) {
3936 SDValue LHS = Val->getOperand(0);
3937 SDValue RHS = Val->getOperand(1);
3938 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3939 bool isInteger = LHS.getValueType().isInteger();
3940 if (Negate)
3941 CC = getSetCCInverse(CC, LHS.getValueType());
3942 SDLoc DL(Val);
3943 // Determine OutCC and handle FP special case.
3944 if (isInteger) {
3945 OutCC = changeIntCCToAArch64CC(CC, RHS);
3946 } else {
3947 assert(LHS.getValueType().isFloatingPoint());
3948 AArch64CC::CondCode ExtraCC;
3949 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3950 // Some floating point conditions can't be tested with a single condition
3951 // code. Construct an additional comparison in this case.
3952 if (ExtraCC != AArch64CC::AL) {
3953 SDValue ExtraCmp;
3954 if (!CCOp.getNode())
3955 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3956 else
3957 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3958 ExtraCC, DL, DAG);
3959 CCOp = ExtraCmp;
3960 Predicate = ExtraCC;
3961 }
3962 }
3963
3964 // Produce a normal comparison if we are first in the chain
3965 if (!CCOp)
3966 return emitComparison(LHS, RHS, CC, DL, DAG);
3967 // Otherwise produce a ccmp.
3968 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3969 DAG);
3970 }
3971 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3972
3973 bool IsOR = Opcode == ISD::OR;
3974
3975 SDValue LHS = Val->getOperand(0);
3976 bool CanNegateL;
3977 bool MustBeFirstL;
3978 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3979 assert(ValidL && "Valid conjunction/disjunction tree");
3980 (void)ValidL;
3981
3982 SDValue RHS = Val->getOperand(1);
3983 bool CanNegateR;
3984 bool MustBeFirstR;
3985 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3986 assert(ValidR && "Valid conjunction/disjunction tree");
3987 (void)ValidR;
3988
3989 // Swap sub-tree that must come first to the right side.
3990 if (MustBeFirstL) {
3991 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3992 std::swap(LHS, RHS);
3993 std::swap(CanNegateL, CanNegateR);
3994 std::swap(MustBeFirstL, MustBeFirstR);
3995 }
3996
3997 bool NegateR;
3998 bool NegateAfterR;
3999 bool NegateL;
4000 bool NegateAfterAll;
4001 if (Opcode == ISD::OR) {
4002 // Swap the sub-tree that we can negate naturally to the left.
4003 if (!CanNegateL) {
4004 assert(CanNegateR && "at least one side must be negatable");
4005 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4006 assert(!Negate);
4007 std::swap(LHS, RHS);
4008 NegateR = false;
4009 NegateAfterR = true;
4010 } else {
4011 // Negate the left sub-tree if possible, otherwise negate the result.
4012 NegateR = CanNegateR;
4013 NegateAfterR = !CanNegateR;
4014 }
4015 NegateL = true;
4016 NegateAfterAll = !Negate;
4017 } else {
4018 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4019 assert(!Negate && "Valid conjunction/disjunction tree");
4020
4021 NegateL = false;
4022 NegateR = false;
4023 NegateAfterR = false;
4024 NegateAfterAll = false;
4025 }
4026
4027 // Emit sub-trees.
4028 AArch64CC::CondCode RHSCC;
4029 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4030 if (NegateAfterR)
4031 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4032 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4033 if (NegateAfterAll)
4034 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4035 return CmpL;
4036}
4037
4038/// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
4039/// In some cases this is even possible with OR operations in the expression.
4040/// See \ref AArch64CCMP.
4041/// \see emitConjunctionRec().
4042static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4043 AArch64CC::CondCode &OutCC) {
4044 bool DummyCanNegate;
4045 bool DummyMustBeFirst;
4046 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4047 return SDValue();
4048
4049 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4050}
4051
4052/// @}
4053
4054/// Returns how profitable it is to fold a comparison's operand's shift and/or
4055/// extension operations.
4056static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4057 auto isSupportedExtend = [&](SDValue V) {
4058 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4059 return true;
4060
4061 if (V.getOpcode() == ISD::AND)
4062 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4063 uint64_t Mask = MaskCst->getZExtValue();
4064 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4065 }
4066
4067 return false;
4068 };
4069
4070 if (!Op.hasOneUse())
4071 return 0;
4072
4073 if (isSupportedExtend(Op))
4074 return 1;
4075
4076 unsigned Opc = Op.getOpcode();
4077 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4078 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4079 uint64_t Shift = ShiftCst->getZExtValue();
4080 if (isSupportedExtend(Op.getOperand(0)))
4081 return (Shift <= 4) ? 2 : 1;
4082 EVT VT = Op.getValueType();
4083 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4084 return 1;
4085 }
4086
4087 return 0;
4088}
4089
4090// emitComparison() converts a comparison with one or negative one into a
4091// comparison with 0. Note that this only works for signed comparisons because
4092// of how ANDS works.
4093static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4094 // Only works for ANDS and AND.
4095 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4096 return false;
4097
4098 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4099 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4100 return true;
4101 }
4102
4103 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4104 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4105 return true;
4106 }
4107
4108 return false;
4109}
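// Illustrative standalone sketch (not from this file): the integer facts
// behind the adjust-to-zero rewrite above, checked on a few signed samples.
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const int32_t Samples[] = {std::numeric_limits<int32_t>::min(), -2, -1, 0,
                             1, 2, std::numeric_limits<int32_t>::max()};
  for (int32_t X : Samples) {
    assert((X < 1) == (X <= 0));  // SETLT  1 -> SETLE  0
    assert((X >= 1) == (X > 0));  // SETGE  1 -> SETGT  0
    assert((X <= -1) == (X < 0)); // SETLE -1 -> SETLT  0
    assert((X > -1) == (X >= 0)); // SETGT -1 -> SETGE  0
  }
  return 0;
}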
4110
4111static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4112 SDValue &AArch64cc, SelectionDAG &DAG,
4113 const SDLoc &DL) {
4114 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4115 EVT VT = RHS.getValueType();
4116 APInt C = RHSC->getAPIntValue();
4117 // shouldBeAdjustedToZero is a special case to better fold with
4118 // emitComparison().
4119 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4120 // Adjust the constant to zero.
4121 // CC has already been adjusted.
4122 RHS = DAG.getConstant(0, DL, VT);
4123 } else if (!isLegalCmpImmed(C)) {
4124 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4125 // Constant does not fit, try adjusting it by one?
4126 switch (CC) {
4127 default:
4128 break;
4129 case ISD::SETLT:
4130 case ISD::SETGE:
4131 if (!C.isMinSignedValue()) {
4132 APInt CMinusOne = C - 1;
4133 if (isLegalCmpImmed(CMinusOne) ||
4134 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4135 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4136 RHS = DAG.getConstant(CMinusOne, DL, VT);
4137 }
4138 }
4139 break;
4140 case ISD::SETULT:
4141 case ISD::SETUGE: {
4142        // C cannot be 0 here: 0 is a legal immediate, and C is not legal.
4143 assert(!C.isZero() && "C should not be zero here");
4144 APInt CMinusOne = C - 1;
4145 if (isLegalCmpImmed(CMinusOne) ||
4146 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4147 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4148 RHS = DAG.getConstant(CMinusOne, DL, VT);
4149 }
4150 break;
4151 }
4152 case ISD::SETLE:
4153 case ISD::SETGT:
4154 if (!C.isMaxSignedValue()) {
4155 APInt CPlusOne = C + 1;
4156 if (isLegalCmpImmed(CPlusOne) ||
4157 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4158 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4159 RHS = DAG.getConstant(CPlusOne, DL, VT);
4160 }
4161 }
4162 break;
4163 case ISD::SETULE:
4164 case ISD::SETUGT: {
4165 if (!C.isAllOnes()) {
4166 APInt CPlusOne = C + 1;
4167 if (isLegalCmpImmed(CPlusOne) ||
4168 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4169 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4170 RHS = DAG.getConstant(CPlusOne, DL, VT);
4171 }
4172 }
4173 break;
4174 }
4175 }
4176 }
4177 }
4178
4179 // Comparisons are canonicalized so that the RHS operand is simpler than the
4180 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4181 // can fold some shift+extend operations on the RHS operand, so swap the
4182 // operands if that can be done.
4183 //
4184 // For example:
4185 // lsl w13, w11, #1
4186 // cmp w13, w12
4187 // can be turned into:
4188 // cmp w12, w11, lsl #1
4189 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4190 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4191 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4192 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4193 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4194
4195 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4196 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4197 std::swap(LHS, RHS);
4198      CC = ISD::getSetCCSwappedOperands(CC);
4199 }
4200 }
4201
4202 SDValue Cmp;
4203  AArch64CC::CondCode AArch64CC;
4204 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4205    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4206
4207 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4208 // For the i8 operand, the largest immediate is 255, so this can be easily
4209 // encoded in the compare instruction. For the i16 operand, however, the
4210 // largest immediate cannot be encoded in the compare.
4211 // Therefore, use a sign extending load and cmn to avoid materializing the
4212 // -1 constant. For example,
4213 // movz w1, #65535
4214 // ldrh w0, [x0, #0]
4215 // cmp w0, w1
4216 // >
4217 // ldrsh w0, [x0, #0]
4218 // cmn w0, #1
4219    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4220 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4221 // ensure both the LHS and RHS are truly zero extended and to make sure the
4222 // transformation is profitable.
4223 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4224 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4225 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4226 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4227 int16_t ValueofRHS = RHS->getAsZExtVal();
4228 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4229 SDValue SExt =
4230 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4231 DAG.getValueType(MVT::i16));
4232 Cmp = emitComparison(
4233 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4234 DL, DAG);
4236 }
4237 }
4238
4239 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4240 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4241 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4242          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4243 }
4244 }
4245 }
4246
4247 if (!Cmp) {
4248 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4250 }
4251 AArch64cc = getCondCode(DAG, AArch64CC);
4252 return Cmp;
4253}
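// Illustrative standalone sketch (not from this file): the constant-adjustment
// trick used above. When C itself is not a legal compare immediate, the same
// predicate can be expressed against C-1 or C+1, as long as the adjustment
// cannot wrap.
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const int32_t Min = std::numeric_limits<int32_t>::min();
  const int32_t Max = std::numeric_limits<int32_t>::max();
  const int32_t Xs[] = {Min, -4097, -1, 0, 1, 4095, 4096, 4097, Max};
  const int32_t Cs[] = {-4097, -1, 1, 4096, 4097, Max};
  for (int32_t X : Xs)
    for (int32_t C : Cs) {
      if (C != Min)
        assert((X < C) == (X <= C - 1)); // SETLT -> SETLE with C-1
      if (C != Max)
        assert((X <= C) == (X < C + 1)); // SETLE -> SETLT with C+1
    }
  return 0;
}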
4254
4255static std::pair<SDValue, SDValue>
4256getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4257 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4258 "Unsupported value type");
4259 SDValue Value, Overflow;
4260 SDLoc DL(Op);
4261 SDValue LHS = Op.getOperand(0);
4262 SDValue RHS = Op.getOperand(1);
4263 unsigned Opc = 0;
4264 switch (Op.getOpcode()) {
4265 default:
4266 llvm_unreachable("Unknown overflow instruction!");
4267 case ISD::SADDO:
4268 Opc = AArch64ISD::ADDS;
4269 CC = AArch64CC::VS;
4270 break;
4271 case ISD::UADDO:
4272 Opc = AArch64ISD::ADDS;
4273 CC = AArch64CC::HS;
4274 break;
4275 case ISD::SSUBO:
4276 Opc = AArch64ISD::SUBS;
4277 CC = AArch64CC::VS;
4278 break;
4279 case ISD::USUBO:
4280 Opc = AArch64ISD::SUBS;
4281 CC = AArch64CC::LO;
4282 break;
4283  // Multiply needs a little bit of extra work.
4284 case ISD::SMULO:
4285 case ISD::UMULO: {
4286 CC = AArch64CC::NE;
4287 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4288 if (Op.getValueType() == MVT::i32) {
4289 // Extend to 64-bits, then perform a 64-bit multiply.
4290 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4291 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4292 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4293 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4294 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4295
4296 // Check that the result fits into a 32-bit integer.
4297 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4298 if (IsSigned) {
4299 // cmp xreg, wreg, sxtw
4300 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4301 Overflow =
4302 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4303 } else {
4304 // tst xreg, #0xffffffff00000000
4305 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4306 Overflow =
4307 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4308 }
4309 break;
4310 }
4311 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4312 // For the 64 bit multiply
4313 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4314 if (IsSigned) {
4315 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4316 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4317 DAG.getConstant(63, DL, MVT::i64));
4318 // It is important that LowerBits is last, otherwise the arithmetic
4319 // shift will not be folded into the compare (SUBS).
4320 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4321 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4322 .getValue(1);
4323 } else {
4324 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4325 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4326 Overflow =
4327 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4328 DAG.getConstant(0, DL, MVT::i64),
4329 UpperBits).getValue(1);
4330 }
4331 break;
4332 }
4333 } // switch (...)
4334
4335 if (Opc) {
4336 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4337
4338 // Emit the AArch64 operation with overflow check.
4339 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4340 Overflow = Value.getValue(1);
4341 }
4342 return std::make_pair(Value, Overflow);
4343}
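// Illustrative standalone sketch (not from this file): the 32-bit overflow
// checks modelled above for SMULO/UMULO -- widen to 64 bits, multiply, then
// test whether the product still fits in 32 bits. Helper names are invented
// for the example.
#include <cassert>
#include <cstdint>

static bool smulo32(int32_t A, int32_t B, int32_t &Out) {
  int64_t Wide = int64_t(A) * int64_t(B);
  Out = int32_t(Wide);         // two's-complement truncation
  return Wide != int64_t(Out); // overflow if sign-extension changed the value
}

static bool umulo32(uint32_t A, uint32_t B, uint32_t &Out) {
  uint64_t Wide = uint64_t(A) * uint64_t(B);
  Out = uint32_t(Wide);
  return (Wide >> 32) != 0;    // overflow if any upper bits are set
}

int main() {
  int32_t S;
  uint32_t U;
  assert(!smulo32(46340, 46340, S)); // 2147395600 fits in int32_t
  assert(smulo32(46341, 46341, S));  // 2147488281 does not
  assert(!umulo32(65535, 65537, U)); // 4294967295 fits in uint32_t
  assert(umulo32(65536, 65536, U));  // 2^32 does not
  return 0;
}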
4344
4345SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4346 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4347 !Subtarget->isNeonAvailable()))
4348 return LowerToScalableOp(Op, DAG);
4349
4350 SDValue Sel = Op.getOperand(0);
4351 SDValue Other = Op.getOperand(1);
4352 SDLoc DL(Sel);
4353
4354 // If the operand is an overflow checking operation, invert the condition
4355 // code and kill the Not operation. I.e., transform:
4356  //   (xor overflow_op_bool, 1)
4357 // -->
4358 // (csel 1, 0, invert(cc), overflow_op_bool)
4359 // ... which later gets transformed to just a cset instruction with an
4360 // inverted condition code, rather than a cset + eor sequence.
4361  if (isOverflowIntrOpRes(Sel)) {
4362 // Only lower legal XALUO ops.
4363    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4364 return SDValue();
4365
4366 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4367 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4368    AArch64CC::CondCode CC;
4369 SDValue Value, Overflow;
4370 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4371 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4372 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4373 CCVal, Overflow);
4374 }
4375 // If neither operand is a SELECT_CC, give up.
4376 if (Sel.getOpcode() != ISD::SELECT_CC)
4377 std::swap(Sel, Other);
4378 if (Sel.getOpcode() != ISD::SELECT_CC)
4379 return Op;
4380
4381 // The folding we want to perform is:
4382 // (xor x, (select_cc a, b, cc, 0, -1) )
4383 // -->
4384 // (csel x, (xor x, -1), cc ...)
4385 //
4386 // The latter will get matched to a CSINV instruction.
4387
4388 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4389 SDValue LHS = Sel.getOperand(0);
4390 SDValue RHS = Sel.getOperand(1);
4391 SDValue TVal = Sel.getOperand(2);
4392 SDValue FVal = Sel.getOperand(3);
4393
4394 // FIXME: This could be generalized to non-integer comparisons.
4395 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4396 return Op;
4397
4398 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4399 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4400
4401 // The values aren't constants, this isn't the pattern we're looking for.
4402 if (!CFVal || !CTVal)
4403 return Op;
4404
4405 // We can commute the SELECT_CC by inverting the condition. This
4406 // might be needed to make this fit into a CSINV pattern.
4407 if (CTVal->isAllOnes() && CFVal->isZero()) {
4408 std::swap(TVal, FVal);
4409 std::swap(CTVal, CFVal);
4410 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4411 }
4412
4413 // If the constants line up, perform the transform!
4414 if (CTVal->isZero() && CFVal->isAllOnes()) {
4415 SDValue CCVal;
4416 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4417
4418 FVal = Other;
4419 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4420 DAG.getAllOnesConstant(DL, Other.getValueType()));
4421
4422 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4423 CCVal, Cmp);
4424 }
4425
4426 return Op;
4427}
4428
4429// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4430// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4431// sets 'C' bit to 0.
4433 SDLoc DL(Value);
4434 EVT VT = Value.getValueType();
4435 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4436 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4437 SDValue Cmp =
4438 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4439 return Cmp.getValue(1);
4440}
4441
4442// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4443// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4445 bool Invert) {
4446 assert(Glue.getResNo() == 1);
4447 SDLoc DL(Glue);
4448 SDValue Zero = DAG.getConstant(0, DL, VT);
4449 SDValue One = DAG.getConstant(1, DL, VT);
4451 SDValue CC = getCondCode(DAG, Cond);
4452 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4453}
4454
4455// Value is 1 if 'V' bit of NZCV is 1, else 0
4457 assert(Glue.getResNo() == 1);
4458 SDLoc DL(Glue);
4459 SDValue Zero = DAG.getConstant(0, DL, VT);
4460 SDValue One = DAG.getConstant(1, DL, VT);
4462 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4463}
4464
4465// This lowering is inefficient, but it will get cleaned up by
4466// `foldOverflowCheck`
4467static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4468 unsigned Opcode, bool IsSigned) {
4469 EVT VT0 = Op.getValue(0).getValueType();
4470 EVT VT1 = Op.getValue(1).getValueType();
4471
4472 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4473 return SDValue();
4474
4475 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4476 SDValue OpLHS = Op.getOperand(0);
4477 SDValue OpRHS = Op.getOperand(1);
4478 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4479
4480 SDLoc DL(Op);
4481
4482 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4483 OpRHS, OpCarryIn);
4484
4485 SDValue OutFlag =
4486 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4487 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4488
4489 return DAG.getMergeValues({Sum, OutFlag}, DL);
4490}
4491static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4491
4493 // Let legalize expand this if it isn't a legal type yet.
4494 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4495 return SDValue();
4496
4497 SDLoc DL(Op);
4498  AArch64CC::CondCode CC;
4499 // The actual operation that sets the overflow or carry flag.
4500 SDValue Value, Overflow;
4501 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4502
4503 // We use 0 and 1 as false and true values.
4504 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4505 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4506
4507 // We use an inverted condition, because the conditional select is inverted
4508 // too. This will allow it to be selected to a single instruction:
4509 // CSINC Wd, WZR, WZR, invert(cond).
4510 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4511 Overflow =
4512 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4513
4514 return DAG.getMergeValues({Value, Overflow}, DL);
4515}
4516
4517// Prefetch operands are:
4518// 1: Address to prefetch
4519// 2: bool isWrite
4520// 3: int locality (0 = no locality ... 3 = extreme locality)
4521// 4: bool isDataCache
4522static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4523 SDLoc DL(Op);
4524 unsigned IsWrite = Op.getConstantOperandVal(2);
4525 unsigned Locality = Op.getConstantOperandVal(3);
4526 unsigned IsData = Op.getConstantOperandVal(4);
4527
4528 bool IsStream = !Locality;
4529 // When the locality number is set
4530 if (Locality) {
4531 // The front-end should have filtered out the out-of-range values
4532 assert(Locality <= 3 && "Prefetch locality out-of-range");
4533 // The locality degree is the opposite of the cache speed.
4534 // Put the number the other way around.
4535 // The encoding starts at 0 for level 1
4536 Locality = 3 - Locality;
4537 }
4538
4539  // Build the mask value encoding the expected behavior.
4540 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4541 (!IsData << 3) | // IsDataCache bit
4542 (Locality << 1) | // Cache level bits
4543 (unsigned)IsStream; // Stream bit
4544 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4545 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4546 Op.getOperand(1));
4547}
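// Illustrative standalone sketch (not from this file): the prefetch-operand
// packing performed above, restated as plain C++ with a few spot checks. The
// mnemonic names in the asserts are assumptions of this example rather than a
// definitive reference.
#include <cassert>

static unsigned encodePrfOp(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;                // locality 0 -> streaming
  unsigned Level = Locality ? 3 - Locality : 0; // invert: 3 -> L1 ... 1 -> L3
  return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | unsigned(IsStream);
}

int main() {
  assert(encodePrfOp(false, 3, true) == 0b00000); // PLDL1KEEP
  assert(encodePrfOp(true, 1, true) == 0b10100);  // PSTL3KEEP
  assert(encodePrfOp(false, 0, true) == 0b00001); // PLDL1STRM
  return 0;
}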
4548
4549// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4550// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4551// SUBS (AND X Y) Z, which combines better with emitComparison().
4552static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4553 SelectionDAG &DAG, const SDLoc DL) {
4554 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4555 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4556    ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4557 if (LHSConstOp && RHSConst) {
4558 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4559 uint64_t RHSConstant = RHSConst->getZExtValue();
4560 if (isPowerOf2_64(RHSConstant)) {
4561 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4562 LHS =
4563 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4564 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4565 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4566 CC = ISD::SETEQ;
4567 }
4568 }
4569 }
4570}
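// Illustrative standalone sketch (not from this file): an exhaustive 8-bit
// check of the rewrite performed by the helper above for an unsigned '<'
// against a power of two.
#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned M = 0; M < 256; ++M)
      for (unsigned Shift = 0; Shift < 8; ++Shift) {
        unsigned P = 1u << Shift; // RHS constant, a power of two
        assert(((X & M) < P) == ((X & (M & ~(P - 1))) == 0));
      }
  return 0;
}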
4571
4572SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4573 SelectionDAG &DAG) const {
4574 EVT VT = Op.getValueType();
4575 if (VT.isScalableVector()) {
4576 SDValue SrcVal = Op.getOperand(0);
4577
4578 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4579 // Break conversion in two with the first part converting to f32 and the
4580 // second using native f32->VT instructions.
4581 SDLoc DL(Op);
4582 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4583 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4584 }
4585
4586 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4587 }
4588
4589 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4590 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4591
4592 bool IsStrict = Op->isStrictFPOpcode();
4593 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4594 EVT Op0VT = Op0.getValueType();
4595 if (VT == MVT::f64) {
4596    // f32->f64 and f16->f64 extends are legal.
4597 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4598 return Op;
4599 // Split bf16->f64 extends into two fpextends.
4600 if (Op0VT == MVT::bf16 && IsStrict) {
4601 SDValue Ext1 =
4602 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4603 {Op0, Op.getOperand(0)});
4604 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4605 {Ext1, Ext1.getValue(1)});
4606 }
4607 if (Op0VT == MVT::bf16)
4608 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4609 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4610 return SDValue();
4611 }
4612
4613 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4614 return SDValue();
4615}
4616
4617SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4618 SelectionDAG &DAG) const {
4619 EVT VT = Op.getValueType();
4620 bool IsStrict = Op->isStrictFPOpcode();
4621 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4622 EVT SrcVT = SrcVal.getValueType();
4623 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4624
4625 if (VT.isScalableVector()) {
4626 // Let common code split the operation.
4627 if (SrcVT == MVT::nxv8f32)
4628 return Op;
4629
4630 if (VT.getScalarType() != MVT::bf16)
4631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4632
4633 SDLoc DL(Op);
4634 constexpr EVT I32 = MVT::nxv4i32;
4635 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4636
4637 SDValue NaN;
4638 SDValue Narrow;
4639
4640 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4641 if (Subtarget->hasBF16())
4642 return LowerToPredicatedOp(Op, DAG,
4643 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4644
4645 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4646
4647 // Set the quiet bit.
4648 if (!DAG.isKnownNeverSNaN(SrcVal))
4649 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4650 } else if (SrcVT == MVT::nxv2f64 &&
4651 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4652 // Round to float without introducing rounding errors and try again.
4653 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4654 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4655 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4656
4657      SmallVector<SDValue, 3> NewOps;
4658 if (IsStrict)
4659 NewOps.push_back(Op.getOperand(0));
4660 NewOps.push_back(Narrow);
4661 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4662 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4663 } else
4664 return SDValue();
4665
4666 if (!Trunc) {
4667 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4668 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4669 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4670 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4671 }
4672
4673 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4674 // 0x80000000.
4675 if (NaN) {
4676 EVT I1 = I32.changeElementType(MVT::i1);
4677 EVT CondVT = VT.changeElementType(MVT::i1);
4678 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4679 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4680 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4681 }
4682
4683 // Now that we have rounded, shift the bits into position.
4684 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4685 return getSVESafeBitCast(VT, Narrow, DAG);
4686 }
4687
4688 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4689 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4690
4691 // Expand cases where the result type is BF16 but we don't have hardware
4692 // instructions to lower it.
4693 if (VT.getScalarType() == MVT::bf16 &&
4694 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4695 Subtarget->hasBF16())) {
4696 SDLoc DL(Op);
4697 SDValue Narrow = SrcVal;
4698 SDValue NaN;
4699 EVT I32 = SrcVT.changeElementType(MVT::i32);
4700 EVT F32 = SrcVT.changeElementType(MVT::f32);
4701 if (SrcVT.getScalarType() == MVT::f32) {
4702 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4703 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4704 if (!NeverSNaN) {
4705 // Set the quiet bit.
4706 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4707 DAG.getConstant(0x400000, DL, I32));
4708 }
4709 } else if (SrcVT.getScalarType() == MVT::f64) {
4710 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4711 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4712 } else {
4713 return SDValue();
4714 }
4715 if (!Trunc) {
4716 SDValue One = DAG.getConstant(1, DL, I32);
4717 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4718 DAG.getShiftAmountConstant(16, I32, DL));
4719 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4720 SDValue RoundingBias =
4721 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4722 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4723 }
4724
4725 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4726 // 0x80000000.
4727 if (NaN) {
4728 SDValue IsNaN = DAG.getSetCC(
4729 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4730 SrcVal, SrcVal, ISD::SETUO);
4731 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4732 }
4733
4734 // Now that we have rounded, shift the bits into position.
4735 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4736 DAG.getShiftAmountConstant(16, I32, DL));
4737 if (VT.isVector()) {
4738 EVT I16 = I32.changeVectorElementType(MVT::i16);
4739 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4740 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4741 }
4742 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4743 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4744 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4745 : Result;
4746 }
4747
4748 if (SrcVT != MVT::f128) {
4749 // Expand cases where the input is a vector bigger than NEON.
4751 return SDValue();
4752
4753 // It's legal except when f128 is involved
4754 return Op;
4755 }
4756
4757 return SDValue();
4758}
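// Illustrative standalone sketch (not from this file): the software f32->bf16
// narrowing used above when no native instruction is available -- round to
// nearest with ties to even via the "lsb + 0x7fff" bias, and quiet NaNs by
// setting the f32 quiet bit. The helper name is invented for the example.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t f32ToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if ((Bits & 0x7FFFFFFF) > 0x7F800000)         // NaN: quiet it, don't round
    return uint16_t((Bits | 0x400000) >> 16);
  uint32_t Lsb = (Bits >> 16) & 1;              // lsb of the kept mantissa
  return uint16_t((Bits + 0x7FFF + Lsb) >> 16); // round to nearest even
}

int main() {
  assert(f32ToBF16(1.0f) == 0x3F80);
  assert(f32ToBF16(-2.0f) == 0xC000);
  assert(f32ToBF16(3.140625f) == 0x4049); // exactly representable in bf16
  return 0;
}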
4759
4760SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4761 SelectionDAG &DAG) const {
4762 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4763 // Any additional optimization in this function should be recorded
4764 // in the cost tables.
4765 bool IsStrict = Op->isStrictFPOpcode();
4766 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4767 EVT VT = Op.getValueType();
4768
4769 assert(!(IsStrict && VT.isScalableVector()) &&
4770 "Unimplemented SVE support for STRICT_FP_to_INT!");
4771
4772 // f16 conversions are promoted to f32 when full fp16 is not supported.
4773 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4774 InVT.getVectorElementType() == MVT::bf16) {
4775 EVT NewVT = VT.changeElementType(MVT::f32);
4776 SDLoc DL(Op);
4777 if (IsStrict) {
4778 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4779 {Op.getOperand(0), Op.getOperand(1)});
4780 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4781 {Ext.getValue(1), Ext.getValue(0)});
4782 }
4783 return DAG.getNode(
4784 Op.getOpcode(), DL, Op.getValueType(),
4785 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4786 }
4787
4788 if (VT.isScalableVector()) {
4789 if (VT.getVectorElementType() == MVT::i1) {
4790 SDLoc DL(Op);
4791 EVT CvtVT = getPromotedVTForPredicate(VT);
4792 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4793 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4794 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4795 }
4796
4797 // Let common code split the operation.
4798 if (InVT == MVT::nxv8f32)
4799 return Op;
4800
4801 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4802 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4803 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4804 return LowerToPredicatedOp(Op, DAG, Opcode);
4805 }
4806
4807 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4808 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4809 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4810
4811 uint64_t VTSize = VT.getFixedSizeInBits();
4812 uint64_t InVTSize = InVT.getFixedSizeInBits();
4813 if (VTSize < InVTSize) {
4814 SDLoc DL(Op);
4815 if (IsStrict) {
4817 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4818 {Op.getOperand(0), Op.getOperand(1)});
4819 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4820 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4821 }
4822 SDValue Cv =
4823 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4824 Op.getOperand(0));
4825 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4826 }
4827
4828 if (VTSize > InVTSize) {
4829 SDLoc DL(Op);
4830 MVT ExtVT =
4833 if (IsStrict) {
4834 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4835 {Op.getOperand(0), Op.getOperand(1)});
4836 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4837 {Ext.getValue(1), Ext.getValue(0)});
4838 }
4839 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4840 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4841 }
4842
4843 // Use a scalar operation for conversions between single-element vectors of
4844 // the same size.
4845 if (InVT.getVectorNumElements() == 1) {
4846 SDLoc DL(Op);
4847 SDValue Extract = DAG.getNode(
4849 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4850 EVT ScalarVT = VT.getScalarType();
4851 if (IsStrict)
4852 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4853 {Op.getOperand(0), Extract});
4854 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4855 }
4856
4857 // Type changing conversions are illegal.
4858 return Op;
4859}
4860
4861SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4862 SelectionDAG &DAG) const {
4863 bool IsStrict = Op->isStrictFPOpcode();
4864 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4865
4866 if (SrcVal.getValueType().isVector())
4867 return LowerVectorFP_TO_INT(Op, DAG);
4868
4869 // f16 conversions are promoted to f32 when full fp16 is not supported.
4870 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4871 SrcVal.getValueType() == MVT::bf16) {
4872 SDLoc DL(Op);
4873 if (IsStrict) {
4874 SDValue Ext =
4875 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4876 {Op.getOperand(0), SrcVal});
4877 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4878 {Ext.getValue(1), Ext.getValue(0)});
4879 }
4880 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4881 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4882 }
4883
4884 if (SrcVal.getValueType() != MVT::f128) {
4885 // It's legal except when f128 is involved
4886 return Op;
4887 }
4888
4889 return SDValue();
4890}
4891
4892SDValue
4893AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4894 SelectionDAG &DAG) const {
4895 // AArch64 FP-to-int conversions saturate to the destination element size, so
4896 // we can lower common saturating conversions to simple instructions.
4897 SDValue SrcVal = Op.getOperand(0);
4898 EVT SrcVT = SrcVal.getValueType();
4899 EVT DstVT = Op.getValueType();
4900 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4901
4902 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4903 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4904 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4905 assert(SatWidth <= DstElementWidth &&
4906 "Saturation width cannot exceed result width");
4907
4908 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4909 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4910 // types, so this is hard to reach.
4911 if (DstVT.isScalableVector())
4912 return SDValue();
4913
4914 EVT SrcElementVT = SrcVT.getVectorElementType();
4915
4916 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4917 SDLoc DL(Op);
4918 SDValue SrcVal2;
4919 if ((SrcElementVT == MVT::f16 &&
4920 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4921 SrcElementVT == MVT::bf16) {
4922 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4923 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4924 // If we are extending to a v8f32, split into two v4f32 to produce legal
4925 // types.
4926 if (F32VT.getSizeInBits() > 128) {
4927 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4928 F32VT = F32VT.getHalfNumVectorElementsVT();
4929 }
4930 SrcVT = F32VT;
4931 SrcElementVT = MVT::f32;
4932 SrcElementWidth = 32;
4933 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4934 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4935 return SDValue();
4936
4937 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4938 // width and produce a fcvtzu.
4939 if (SatWidth == 64 && SrcElementWidth < 64) {
4940 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4941 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4942 SrcVT = F64VT;
4943 SrcElementVT = MVT::f64;
4944 SrcElementWidth = 64;
4945 }
4946 // Cases that we can emit directly.
4947 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4948 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4949 DAG.getValueType(DstVT.getScalarType()));
4950 if (SrcVal2) {
4951 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4952 DAG.getValueType(DstVT.getScalarType()));
4953 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4954 }
4955 return Res;
4956 }
4957
4958 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4959 // result. This is only valid if the legal cvt is larger than the saturate
4960 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4961 // (at least until sqxtn is selected).
4962 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4963 return SDValue();
4964
4965 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4966 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4967 DAG.getValueType(IntVT.getScalarType()));
4968 SDValue NativeCvt2 =
4969 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4970 DAG.getValueType(IntVT.getScalarType()))
4971 : SDValue();
4972 SDValue Sat, Sat2;
4973 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4974 SDValue MinC = DAG.getConstant(
4975 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4976 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4977 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4978 SDValue MaxC = DAG.getConstant(
4979 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4980 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4981 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4982 } else {
4983 SDValue MinC = DAG.getConstant(
4984 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4985 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4986 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4987 }
4988
4989 if (SrcVal2)
4990 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4992 Sat, Sat2);
4993
4994 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4995}
4996
4997SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4998 SelectionDAG &DAG) const {
4999 // AArch64 FP-to-int conversions saturate to the destination register size, so
5000 // we can lower common saturating conversions to simple instructions.
5001 SDValue SrcVal = Op.getOperand(0);
5002 EVT SrcVT = SrcVal.getValueType();
5003
5004 if (SrcVT.isVector())
5005 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5006
5007 EVT DstVT = Op.getValueType();
5008 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5009 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5010 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5011 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5012
5013 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5014 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5015 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5016 SrcVT = MVT::f32;
5017 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5018 SrcVT != MVT::bf16)
5019 return SDValue();
5020
5021 SDLoc DL(Op);
5022 // Cases that we can emit directly.
5023 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5024 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5025 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5026 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5027 DAG.getValueType(DstVT));
5028
5029 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5030 // result. This is only valid if the legal cvt is larger than the saturate
5031 // width.
5032 if (DstWidth < SatWidth)
5033 return SDValue();
5034
5035 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5036 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5037 SDValue CVTf32 =
5038 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5039 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5040 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5041 DAG.getValueType(SatVT));
5042 }
5043 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5044 return DAG.getBitcast(DstVT, CVTf32);
5045 }
5046
5047 SDValue NativeCvt =
5048 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5049 SDValue Sat;
5050 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5051 SDValue MinC = DAG.getConstant(
5052 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5053 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5054 SDValue MaxC = DAG.getConstant(
5055 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5056 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5057 } else {
5058 SDValue MinC = DAG.getConstant(
5059 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5060 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5061 }
5062
5063 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5064}
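// Illustrative standalone sketch (not from this file): the "convert at a legal
// wider width, then clamp" pattern used above when the saturation width is
// narrower than the destination. The helper name and the defensive pre-clamp
// are assumptions of the example (FCVTZS itself already saturates).
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t fpToI16Sat(double D) {
  double Clamped = std::min(std::max(D, -2147483648.0), 2147483647.0);
  int32_t Wide = int32_t(Clamped);           // "native" conversion result
  Wide = std::min(Wide, int32_t(INT16_MAX)); // SMIN against the saturate max
  Wide = std::max(Wide, int32_t(INT16_MIN)); // SMAX against the saturate min
  return int16_t(Wide);                      // TRUNCATE
}

int main() {
  assert(fpToI16Sat(1234.9) == 1234);
  assert(fpToI16Sat(1e9) == INT16_MAX);
  assert(fpToI16Sat(-1e9) == INT16_MIN);
  return 0;
}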
5065
5066SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5067 SelectionDAG &DAG) const {
5068 EVT VT = Op.getValueType();
5069 SDValue Src = Op.getOperand(0);
5070 SDLoc DL(Op);
5071
5072 assert(VT.isVector() && "Expected vector type");
5073
5074 EVT CastVT =
5075 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5076
5077 // Round the floating-point value into a floating-point register with the
5078 // current rounding mode.
5079 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5080
5081 // Truncate the rounded floating point to an integer.
5082 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5084}
5085
5086SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5087 SelectionDAG &DAG) const {
5088 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5089 // Any additional optimization in this function should be recorded
5090 // in the cost tables.
5091 bool IsStrict = Op->isStrictFPOpcode();
5092 EVT VT = Op.getValueType();
5093 SDLoc DL(Op);
5094 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5095 EVT InVT = In.getValueType();
5096 unsigned Opc = Op.getOpcode();
5097 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5098
5099 assert(!(IsStrict && VT.isScalableVector()) &&
5100         "Unimplemented SVE support for ISD::STRICT_INT_TO_FP!");
5101
5102 // NOTE: i1->bf16 does not require promotion to f32.
5103 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5104 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5105 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5106 : DAG.getConstantFP(1.0, DL, VT);
5107 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5108 }
5109
5110 // Promote bf16 conversions to f32.
5111 if (VT.getVectorElementType() == MVT::bf16) {
5112 EVT F32 = VT.changeElementType(MVT::f32);
5113 if (IsStrict) {
5114 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5115 {Op.getOperand(0), In});
5116 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5117 {Op.getValueType(), MVT::Other},
5118 {Val.getValue(1), Val.getValue(0),
5119 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5120 }
5121 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5122 DAG.getNode(Op.getOpcode(), DL, F32, In),
5123 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5124 }
5125
5126 if (VT.isScalableVector()) {
5127 // Let common code split the operation.
5128 if (VT == MVT::nxv8f32)
5129 return Op;
5130
5131 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5132 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5133 return LowerToPredicatedOp(Op, DAG, Opcode);
5134 }
5135
5136 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5137 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5138 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5139
5140 uint64_t VTSize = VT.getFixedSizeInBits();
5141 uint64_t InVTSize = InVT.getFixedSizeInBits();
5142 if (VTSize < InVTSize) {
5143 // AArch64 doesn't have a direct vector instruction to convert
5144 // fixed point to floating point AND narrow it at the same time.
5145 // Additional rounding when the target is f32/f64 causes double
5146 // rounding issues. Conversion to f16 is fine due to narrow width.
5147 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5148 bool IsTargetf16 = false;
5149 if (Op.hasOneUse() &&
5150 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5151 // Some vector types are split during legalization into half, followed by
5152 // concatenation, followed by rounding to the original vector type. If we
5153 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5154 SDNode *U = *Op->user_begin();
5155 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5156 EVT TmpVT = U->user_begin()->getValueType(0);
5157 if (TmpVT.getScalarType() == MVT::f16)
5158 IsTargetf16 = true;
5159 }
5160 }
5161
5162 if (IsTargetf32 && !IsTargetf16) {
5163 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5164 }
5165
5166 MVT CastVT =
5168 InVT.getVectorNumElements());
5169 if (IsStrict) {
5170 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5171 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5172 {In.getValue(1), In.getValue(0),
5173 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5174 }
5175 In = DAG.getNode(Opc, DL, CastVT, In);
5176 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5177 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5178 }
5179
5180 if (VTSize > InVTSize) {
5181 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5182 EVT CastVT = VT.changeVectorElementTypeToInteger();
5183 In = DAG.getNode(CastOpc, DL, CastVT, In);
5184 if (IsStrict)
5185 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5186 return DAG.getNode(Opc, DL, VT, In);
5187 }
5188
5189 // Use a scalar operation for conversions between single-element vectors of
5190 // the same size.
5191 if (VT.getVectorNumElements() == 1) {
5192 SDValue Extract =
5194 DAG.getConstant(0, DL, MVT::i64));
5195 EVT ScalarVT = VT.getScalarType();
5196 if (IsStrict)
5197 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5198 {Op.getOperand(0), Extract});
5199 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5200 }
5201
5202 return Op;
5203}
5204
5205SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5206 SelectionDAG &DAG) const {
5207 if (Op.getValueType().isVector())
5208 return LowerVectorINT_TO_FP(Op, DAG);
5209
5210 bool IsStrict = Op->isStrictFPOpcode();
5211 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5212
5213 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5214 Op->getOpcode() == ISD::SINT_TO_FP;
5215
5216 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5217 SDLoc DL(Op);
5218 if (IsStrict) {
5219 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5220 {Op.getOperand(0), SrcVal});
5221 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5222 {Op.getValueType(), MVT::Other},
5223 {Val.getValue(1), Val.getValue(0),
5224 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5225 }
5226 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5227 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5228 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5229 };
5230
5231 if (Op.getValueType() == MVT::bf16) {
5232 unsigned MaxWidth = IsSigned
5233 ? DAG.ComputeMaxSignificantBits(SrcVal)
5234 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
 5235     // Sources with at most 24 significant bits (e.g. any i16) are exact in f32.
5236 if (MaxWidth <= 24) {
5237 return IntToFpViaPromotion(MVT::f32);
5238 }
5239
 5240     // Sources with at most 53 significant bits (e.g. any i32) are exact in f64.
5241 if (MaxWidth <= 53) {
5242 return IntToFpViaPromotion(MVT::f64);
5243 }
5244
5245 // We need to be careful about i64 -> bf16.
 5246     // Consider the i32 value 22216703.
 5247     // This number cannot be represented exactly as an f32, so an itofp will
 5248     // turn it into 22216704.0; an fptrunc to bf16 then turns this into
 5249     // 22282240.0. However, the correct bf16 result is 22151168.0.
5250 // We need to use sticky rounding to get this correct.
5251 if (SrcVal.getValueType() == MVT::i64) {
5252 SDLoc DL(Op);
5253 // This algorithm is equivalent to the following:
5254 // uint64_t SrcHi = SrcVal & ~0xfffull;
5255 // uint64_t SrcLo = SrcVal & 0xfffull;
5256 // uint64_t Highest = SrcVal >> 53;
5257 // bool HasHighest = Highest != 0;
5258 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5259 // double Rounded = static_cast<double>(ToRound);
5260 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
 5261     // bool HasLo = SrcLo != 0;
5262 // bool NeedsAdjustment = HasHighest & HasLo;
5263 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5264 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5265 // return static_cast<__bf16>(Adjusted);
5266 //
5267 // Essentially, what happens is that SrcVal either fits perfectly in a
5268 // double-precision value or it is too big. If it is sufficiently small,
5269 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5270 // ensure that u64 -> double has no rounding error by only using the 52
5271 // MSB of the input. The low order bits will get merged into a sticky bit
5272 // which will avoid issues incurred by double rounding.
5273
5274 // Signed conversion is more or less like so:
5275 // copysign((__bf16)abs(SrcVal), SrcVal)
5276 SDValue SignBit;
5277 if (IsSigned) {
5278 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5279 DAG.getConstant(1ull << 63, DL, MVT::i64));
5280 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5281 }
5282 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5283 DAG.getConstant(~0xfffull, DL, MVT::i64));
5284 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5285 DAG.getConstant(0xfffull, DL, MVT::i64));
 5286     SDValue Highest =
 5287         DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5288 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5289 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5290 SDValue ToRound =
5291 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5292 SDValue Rounded =
5293 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5294 {Op.getOperand(0), ToRound})
5295 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5296
5297 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5298 if (SignBit) {
5299 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5300 }
5301
5302 SDValue HasHighest = DAG.getSetCC(
5303 DL,
5304 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5305 Highest, Zero64, ISD::SETNE);
5306
5307 SDValue HasLo = DAG.getSetCC(
5308 DL,
5309 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5310 SrcLo, Zero64, ISD::SETNE);
5311
5312 SDValue NeedsAdjustment =
5313 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5314 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5315
5316 SDValue AdjustedBits =
5317 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5318 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5319 return IsStrict
5320 ? DAG.getNode(
 5321                      ISD::STRICT_FP_ROUND, DL,
 5322                      {Op.getValueType(), MVT::Other},
5323 {Rounded.getValue(1), Adjusted,
5324 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5325 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5326 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5327 }
5328 }
5329
5330 // f16 conversions are promoted to f32 when full fp16 is not supported.
5331 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5332 return IntToFpViaPromotion(MVT::f32);
5333 }
5334
5335 // i128 conversions are libcalls.
5336 if (SrcVal.getValueType() == MVT::i128)
5337 return SDValue();
5338
5339 // Other conversions are legal, unless it's to the completely software-based
5340 // fp128.
5341 if (Op.getValueType() != MVT::f128)
5342 return Op;
5343 return SDValue();
5344}
5345
5346SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5347 SelectionDAG &DAG) const {
5348 // For iOS, we want to call an alternative entry point: __sincos_stret,
5349 // which returns the values in two S / D registers.
5350 SDLoc DL(Op);
5351 SDValue Arg = Op.getOperand(0);
5352 EVT ArgVT = Arg.getValueType();
5353 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5354
 5355   ArgListTy Args;
 5356   Args.emplace_back(Arg, ArgTy);
5357
5358 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5359 : RTLIB::SINCOS_STRET_F32;
5360 const char *LibcallName = getLibcallName(LC);
5361 SDValue Callee =
5362 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5363
5364 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5365 TargetLowering::CallLoweringInfo CLI(DAG);
 5366   CallingConv::ID CC = getLibcallCallingConv(LC);
 5367   CLI.setDebugLoc(DL)
5368 .setChain(DAG.getEntryNode())
5369 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5370
5371 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5372 return CallResult.first;
5373}
5374
5375static MVT getSVEContainerType(EVT ContentTy);
5376
5377SDValue
5378AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5379 SelectionDAG &DAG) const {
5380 SDLoc DL(Op);
5381 uint64_t EltSize = Op.getConstantOperandVal(2);
5382 EVT VT = Op.getValueType();
5383 switch (EltSize) {
5384 case 1:
5385 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5386 return SDValue();
5387 break;
5388 case 2:
5389 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5390 return SDValue();
5391 break;
5392 case 4:
5393 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5394 return SDValue();
5395 break;
5396 case 8:
5397 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5398 return SDValue();
5399 break;
5400 default:
5401 // Other element sizes are incompatible with whilewr/rw, so expand instead
5402 return SDValue();
5403 }
5404
5405 SDValue PtrA = Op.getOperand(0);
5406 SDValue PtrB = Op.getOperand(1);
5407
5408 if (VT.isScalableVT())
5409 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5410
5411 // We can use the SVE whilewr/whilerw instruction to lower this
5412 // intrinsic by creating the appropriate sequence of scalable vector
5413 // operations and then extracting a fixed-width subvector from the scalable
5414 // vector. Scalable vector variants are already legal.
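  // For example, for a v16i8 result the compare is generated as an nxv16i1
  // WHILEWR/WHILERW predicate, sign-extended to an nxv16i8 vector, and the
  // low fixed-width v16i8 part is then extracted as the final mask.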
5415 EVT ContainerVT =
 5416       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
 5417                        VT.getVectorNumElements(), true);
5418 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5419
5420 SDValue Mask =
5421 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5422 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5423 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5424 DAG.getVectorIdxConstant(0, DL));
5425}
5426
5427SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5428 SelectionDAG &DAG) const {
5429 EVT OpVT = Op.getValueType();
5430 EVT ArgVT = Op.getOperand(0).getValueType();
5431
5433 return LowerFixedLengthBitcastToSVE(Op, DAG);
5434
5435 if (OpVT.isScalableVector()) {
5436 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5437
5438 // Handle type legalisation first.
5439 if (!isTypeLegal(ArgVT)) {
5440 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5441 "Expected int->fp bitcast!");
5442
5443 // Bitcasting between unpacked vector types of different element counts is
5444 // not a NOP because the live elements are laid out differently.
5445 // 01234567
5446 // e.g. nxv2i32 = XX??XX??
5447 // nxv4f16 = X?X?X?X?
5448 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5449 return SDValue();
5450
5451 SDValue ExtResult =
5452 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5453 Op.getOperand(0));
5454 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5455 }
5456
5457 // Bitcasts between legal types with the same element count are legal.
5458 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5459 return Op;
5460
5461 // getSVESafeBitCast does not support casting between unpacked types.
5462 if (!isPackedVectorType(OpVT, DAG))
5463 return SDValue();
5464
5465 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5466 }
5467
5468 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5469 return SDValue();
5470
5471 // Bitcasts between f16 and bf16 are legal.
5472 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5473 return Op;
5474
5475 assert(ArgVT == MVT::i16);
5476 SDLoc DL(Op);
5477
5478 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5479 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5480 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5481}
5482
5483// Returns lane if Op extracts from a two-element vector and lane is constant
5484// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5485static std::optional<uint64_t>
 5486 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
 5487   SDNode *OpNode = Op.getNode();
5488 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5489 return std::nullopt;
5490
5491 EVT VT = OpNode->getOperand(0).getValueType();
 5492   ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
 5493   if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5494 return std::nullopt;
5495
5496 return C->getZExtValue();
5497}
5498
 5499 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
 5500                                    bool isSigned) {
5501 EVT VT = N.getValueType();
5502
5503 if (N.getOpcode() != ISD::BUILD_VECTOR)
5504 return false;
5505
5506 for (const SDValue &Elt : N->op_values()) {
 5507     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
 5508       unsigned EltSize = VT.getScalarSizeInBits();
5509 unsigned HalfSize = EltSize / 2;
5510 if (isSigned) {
5511 if (!isIntN(HalfSize, C->getSExtValue()))
5512 return false;
5513 } else {
5514 if (!isUIntN(HalfSize, C->getZExtValue()))
5515 return false;
5516 }
5517 continue;
5518 }
5519 return false;
5520 }
5521
5522 return true;
5523}
5524
 5525 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
 5526   EVT VT = N.getValueType();
5527 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5528 EVT HalfVT = EVT::getVectorVT(
5529 *DAG.getContext(),
 5530       VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
 5531       VT.getVectorNumElements());
 5532   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5533}
5534
 5535 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
 5536   return N.getOpcode() == ISD::SIGN_EXTEND ||
5537 N.getOpcode() == ISD::ANY_EXTEND ||
5538 isExtendedBUILD_VECTOR(N, DAG, true);
5539}
5540
 5541 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
 5542   return N.getOpcode() == ISD::ZERO_EXTEND ||
5543 N.getOpcode() == ISD::ANY_EXTEND ||
5544 isExtendedBUILD_VECTOR(N, DAG, false);
5545}
5546
 5547 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
 5548   unsigned Opcode = N.getOpcode();
5549 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5550 SDValue N0 = N.getOperand(0);
5551 SDValue N1 = N.getOperand(1);
5552 return N0->hasOneUse() && N1->hasOneUse() &&
5553 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5554 }
5555 return false;
5556}
5557
 5558 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
 5559   unsigned Opcode = N.getOpcode();
5560 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5561 SDValue N0 = N.getOperand(0);
5562 SDValue N1 = N.getOperand(1);
5563 return N0->hasOneUse() && N1->hasOneUse() &&
5564 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5565 }
5566 return false;
5567}
5568
5569SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5570 SelectionDAG &DAG) const {
 5571   // The rounding mode is in bits 23:22 of the FPCR.
 5572   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
 5573   // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
 5574   // so that the shift and the AND get folded into a bitfield extract.
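  // For example, FPCR.RMode == 0b10 (round toward minus infinity) computes
  // ((0x800000 + 0x400000) >> 22) & 3 == 3, matching the 2->3 entry above.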
5575 SDLoc DL(Op);
5576
5577 SDValue Chain = Op.getOperand(0);
5578 SDValue FPCR_64 = DAG.getNode(
5579 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5580 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5581 Chain = FPCR_64.getValue(1);
5582 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5583 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5584 DAG.getConstant(1U << 22, DL, MVT::i32));
5585 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5586 DAG.getConstant(22, DL, MVT::i32));
5587 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5588 DAG.getConstant(3, DL, MVT::i32));
5589 return DAG.getMergeValues({AND, Chain}, DL);
5590}
5591
5592SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5593 SelectionDAG &DAG) const {
5594 SDLoc DL(Op);
5595 SDValue Chain = Op->getOperand(0);
5596 SDValue RMValue = Op->getOperand(1);
5597
5598 // The rounding mode is in bits 23:22 of the FPCR.
5599 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5600 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
 5601   // (((arg - 1) & 3) << 22).
5602 //
 5603   // The argument of llvm.set.rounding must be within the segment [0, 3], so
 5604   // NearestTiesToAway (4) is not handled here. It is the responsibility of the
 5605   // code that generates llvm.set.rounding to ensure this condition.
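  // For example, llvm.set.rounding(0) (round toward zero) computes
  // ((0 - 1) & 3) << 22 == 3 << 22, i.e. FPCR rounding mode 0b11 (RZ),
  // matching the 0->3 entry above.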
5606
5607 // Calculate new value of FPCR[23:22].
5608 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5609 DAG.getConstant(1, DL, MVT::i32));
5610 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5611 DAG.getConstant(0x3, DL, MVT::i32));
5612 RMValue =
5613 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5614 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5615 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5616
5617 // Get current value of FPCR.
5618 SDValue Ops[] = {
5619 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5620 SDValue FPCR =
5621 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5622 Chain = FPCR.getValue(1);
5623 FPCR = FPCR.getValue(0);
5624
 5625   // Put the new rounding mode into FPCR[23:22].
5626 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5627 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5628 DAG.getConstant(RMMask, DL, MVT::i64));
5629 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5630 SDValue Ops2[] = {
5631 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5632 FPCR};
5633 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5634}
5635
5636SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5637 SelectionDAG &DAG) const {
5638 SDLoc DL(Op);
5639 SDValue Chain = Op->getOperand(0);
5640
5641 // Get current value of FPCR.
5642 SDValue Ops[] = {
5643 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5644 SDValue FPCR =
5645 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5646 Chain = FPCR.getValue(1);
5647 FPCR = FPCR.getValue(0);
5648
5649 // Truncate FPCR to 32 bits.
5650 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5651
5652 return DAG.getMergeValues({Result, Chain}, DL);
5653}
5654
5655SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5656 SelectionDAG &DAG) const {
5657 SDLoc DL(Op);
5658 SDValue Chain = Op->getOperand(0);
5659 SDValue Mode = Op->getOperand(1);
5660
5661 // Extend the specified value to 64 bits.
5662 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5663
5664 // Set new value of FPCR.
5665 SDValue Ops2[] = {
5666 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5667 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5668}
5669
5670SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5671 SelectionDAG &DAG) const {
5672 SDLoc DL(Op);
5673 SDValue Chain = Op->getOperand(0);
5674
5675 // Get current value of FPCR.
5676 SDValue Ops[] = {
5677 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5678 SDValue FPCR =
5679 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5680 Chain = FPCR.getValue(1);
5681 FPCR = FPCR.getValue(0);
5682
5683 // Clear bits that are not reserved.
5684 SDValue FPSCRMasked = DAG.getNode(
5685 ISD::AND, DL, MVT::i64, FPCR,
5687
5688 // Set new value of FPCR.
5689 SDValue Ops2[] = {Chain,
5690 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5691 FPSCRMasked};
5692 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5693}
5694
5695static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5696 SDLoc DL, bool &IsMLA) {
5697 bool IsN0SExt = isSignExtended(N0, DAG);
5698 bool IsN1SExt = isSignExtended(N1, DAG);
5699 if (IsN0SExt && IsN1SExt)
5700 return AArch64ISD::SMULL;
5701
5702 bool IsN0ZExt = isZeroExtended(N0, DAG);
5703 bool IsN1ZExt = isZeroExtended(N1, DAG);
5704
5705 if (IsN0ZExt && IsN1ZExt)
5706 return AArch64ISD::UMULL;
5707
5708 // Select UMULL if we can replace the other operand with an extend.
5709 EVT VT = N0.getValueType();
5710 unsigned EltSize = VT.getScalarSizeInBits();
5711 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5712 if (IsN0ZExt || IsN1ZExt) {
5713 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5714 return AArch64ISD::UMULL;
5715 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5716 DAG.MaskedValueIsZero(N1, Mask)) {
 5717     // For v2i64 we look more aggressively at whether the top halves of both
 5718     // operands are known to be zero, to avoid scalarization.
5719 return AArch64ISD::UMULL;
5720 }
5721
5722 if (IsN0SExt || IsN1SExt) {
5723 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5724 return AArch64ISD::SMULL;
5725 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5726 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5727 return AArch64ISD::SMULL;
5728 }
5729
5730 if (!IsN1SExt && !IsN1ZExt)
5731 return 0;
5732
5733 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5734 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5735 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5736 IsMLA = true;
5737 return AArch64ISD::SMULL;
5738 }
5739 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5740 IsMLA = true;
5741 return AArch64ISD::UMULL;
5742 }
5743 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5744 std::swap(N0, N1);
5745 IsMLA = true;
5746 return AArch64ISD::UMULL;
5747 }
5748 return 0;
5749}
5750
5751SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5752 EVT VT = Op.getValueType();
5753
5754 bool OverrideNEON = !Subtarget->isNeonAvailable();
5755 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5756 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5757
5758 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5759 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5760 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5761 "unexpected type for custom-lowering ISD::MUL");
5762 SDValue N0 = Op.getOperand(0);
5763 SDValue N1 = Op.getOperand(1);
5764 bool isMLA = false;
5765 EVT OVT = VT;
5766 if (VT.is64BitVector()) {
5767 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5768 isNullConstant(N0.getOperand(1)) &&
 5769         N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 5770         isNullConstant(N1.getOperand(1))) {
5771 N0 = N0.getOperand(0);
5772 N1 = N1.getOperand(0);
5773 VT = N0.getValueType();
5774 } else {
5775 if (VT == MVT::v1i64) {
5776 if (Subtarget->hasSVE())
5777 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5778 // Fall through to expand this. It is not legal.
5779 return SDValue();
5780 } else
5781 // Other vector multiplications are legal.
5782 return Op;
5783 }
5784 }
5785
5786 SDLoc DL(Op);
5787 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5788
5789 if (!NewOpc) {
5790 if (VT.getVectorElementType() == MVT::i64) {
5791 // If SVE is available then i64 vector multiplications can also be made
5792 // legal.
5793 if (Subtarget->hasSVE())
5794 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5795 // Fall through to expand this. It is not legal.
5796 return SDValue();
5797 } else
5798 // Other vector multiplications are legal.
5799 return Op;
5800 }
5801
5802 // Legalize to a S/UMULL instruction
5803 SDValue Op0;
5804 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5805 if (!isMLA) {
5806 Op0 = skipExtensionForVectorMULL(N0, DAG);
 5807     assert(Op0.getValueType().is64BitVector() &&
 5808            Op1.getValueType().is64BitVector() &&
5809 "unexpected types for extended operands to VMULL");
5810 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5811 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5812 DAG.getConstant(0, DL, MVT::i64));
5813 }
5814 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5815 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5816 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
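  // For example, mul(add(zext A, zext B), zext C) is emitted below as
  // add(umull(A, C), umull(B, C)); both compute (A + B) * C modulo 2^N.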
 5817   SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
 5818   SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
 5819   EVT Op1VT = Op1.getValueType();
5820 return DAG.getNode(
 5821       ISD::EXTRACT_SUBVECTOR, DL, OVT,
 5822       DAG.getNode(N0.getOpcode(), DL, VT,
5823 DAG.getNode(NewOpc, DL, VT,
5824 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5825 DAG.getNode(NewOpc, DL, VT,
5826 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5827 DAG.getConstant(0, DL, MVT::i64));
5828}
5829
5830static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5831 int Pattern) {
5832 if (Pattern == AArch64SVEPredPattern::all)
5833 return DAG.getConstant(1, DL, VT);
5834 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5835 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5836}
5837
 5838 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
 5839                                          bool IsSigned, bool IsEqual) {
5840 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5841 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5842
5843 if (!N->getValueType(0).isScalableVector() ||
5844 !isa<ConstantSDNode>(N->getOperand(Op1)))
5845 return SDValue();
5846
5847 SDLoc DL(N);
5848 APInt Y = N->getConstantOperandAPInt(Op1);
5849
5850 // When the second operand is the maximum value, comparisons that include
5851 // equality can never fail and thus we can return an all active predicate.
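  // For example, a signed while-less-or-equal against INT64_MAX holds for
  // every element, so it lowers to an all-active predicate.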
5852 if (IsEqual)
5853 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5854 return DAG.getConstant(1, DL, N->getValueType(0));
5855
5856 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5857 return SDValue();
5858
5859 APInt X = N->getConstantOperandAPInt(Op0);
5860
5861 bool Overflow;
5862 APInt NumActiveElems =
5863 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5864
5865 if (Overflow)
5866 return SDValue();
5867
5868 if (IsEqual) {
5869 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5870 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5871 : NumActiveElems.uadd_ov(One, Overflow);
5872 if (Overflow)
5873 return SDValue();
5874 }
5875
5876 std::optional<unsigned> PredPattern =
 5877       getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
 5878   unsigned MinSVEVectorSize = std::max(
 5879       DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
 5880   unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5881 if (PredPattern != std::nullopt &&
5882 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5883 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5884
5885 return SDValue();
5886}
5887
5888// Returns a safe bitcast between two scalable vector predicates, where
5889// any newly created lanes from a widening bitcast are defined as zero.
 5890 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
 5891   SDLoc DL(Op);
5892 EVT InVT = Op.getValueType();
5893
5894 assert(InVT.getVectorElementType() == MVT::i1 &&
5895 VT.getVectorElementType() == MVT::i1 &&
5896 "Expected a predicate-to-predicate bitcast");
 5897   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
 5898          InVT.isScalableVector() &&
5899 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5900 "Only expect to cast between legal scalable predicate types!");
5901
 5902   // Return the operand if the cast isn't changing type.
5903 if (InVT == VT)
5904 return Op;
5905
5906 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5907 // than VT. This will increase the chances of removing casts that introduce
5908 // new lanes, which have to be explicitly zero'd.
5909 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5910 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5911 Op.getOperand(1).getValueType().bitsGT(VT))
5912 Op = Op.getOperand(1);
5913
5914 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5915
5916 // We only have to zero the lanes if new lanes are being defined, e.g. when
5917 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5918 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5919 // we can return here.
5920 if (InVT.bitsGT(VT))
5921 return Reinterpret;
5922
5923 // Check if the other lanes are already known to be zeroed by
5924 // construction.
 5925   if (isZeroingInactiveLanes(Op))
 5926     return Reinterpret;
5927
5928 // Zero the newly introduced lanes.
5929 SDValue Mask = DAG.getConstant(1, DL, InVT);
5930 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5931 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5932}
5933
5934SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5935 SDValue Chain, SDLoc DL,
5936 EVT VT) const {
5937 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
 5938   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
 5939                                          getPointerTy(DAG.getDataLayout()));
 5940   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5941 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5942 TargetLowering::CallLoweringInfo CLI(DAG);
 5943   ArgListTy Args;
 5944   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5945 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5946 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5947 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5948 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5949 Mask);
5950}
5951
5952// Lower an SME LDR/STR ZA intrinsic
5953// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5954// folded into the instruction
5955// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5956// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5957// and tile slice registers
5958// ldr(%tileslice, %ptr, %vecnum)
5959// ->
5960// %svl = rdsvl
5961// %ptr2 = %ptr + %svl * %vecnum
5962// %tileslice2 = %tileslice + %vecnum
5963// ldr [%tileslice2, 0], [%ptr2, 0]
5964// Case 3: If the vecnum is an immediate out of range, then the same is done as
5965// case 2, but the base and slice registers are modified by the greatest
 5966 // multiple of 16 not exceeding the vecnum and the remainder is folded into the
5967// instruction. This means that successive loads and stores that are offset from
5968// each other can share the same base and slice register updates.
5969// ldr(%tileslice, %ptr, 22)
5970// ldr(%tileslice, %ptr, 23)
5971// ->
5972// %svl = rdsvl
 5973 // %ptr2 = %ptr + %svl * 16
 5974 // %tileslice2 = %tileslice + 16
 5975 // ldr [%tileslice2, 6], [%ptr2, 6]
 5976 // ldr [%tileslice2, 7], [%ptr2, 7]
5977// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5978// operand and the immediate can be folded into the instruction, like case 2.
5979// ldr(%tileslice, %ptr, %vecnum + 7)
5980// ldr(%tileslice, %ptr, %vecnum + 8)
5981// ->
5982// %svl = rdsvl
5983// %ptr2 = %ptr + %svl * %vecnum
5984// %tileslice2 = %tileslice + %vecnum
5985// ldr [%tileslice2, 7], [%ptr2, 7]
5986// ldr [%tileslice2, 8], [%ptr2, 8]
5987// Case 5: The vecnum being an add of an immediate out of range is also handled,
5988// in which case the same remainder logic as case 3 is used.
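// For example, with the modulo-16 split below, vecnum 22 becomes a base/slice
// update of 16 with an immediate of 6, and vecnum 23 reuses the same update
// with an immediate of 7.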
 5989 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
 5990   SDLoc DL(N);
5991
5992 SDValue TileSlice = N->getOperand(2);
5993 SDValue Base = N->getOperand(3);
5994 SDValue VecNum = N->getOperand(4);
5995 int32_t ConstAddend = 0;
5996 SDValue VarAddend = VecNum;
5997
5998 // If the vnum is an add of an immediate, we can fold it into the instruction
5999 if (VecNum.getOpcode() == ISD::ADD &&
6000 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6001 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6002 VarAddend = VecNum.getOperand(0);
6003 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6004 ConstAddend = ImmNode->getSExtValue();
6005 VarAddend = SDValue();
6006 }
6007
6008 int32_t ImmAddend = ConstAddend % 16;
6009 if (int32_t C = (ConstAddend - ImmAddend)) {
6010 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6011 VarAddend = VarAddend
6012 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6013 : CVal;
6014 }
6015
6016 if (VarAddend) {
6017 // Get the vector length that will be multiplied by vnum
6018 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6019 DAG.getConstant(1, DL, MVT::i32));
6020
6021 // Multiply SVL and vnum then add it to the base
6022 SDValue Mul = DAG.getNode(
6023 ISD::MUL, DL, MVT::i64,
6024 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6025 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6026 // Just add vnum to the tileslice
6027 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6028 }
6029
6030 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6031 DL, MVT::Other,
6032 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6033 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6034}
6035
 6036 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
 6037   SDLoc DL(Op);
6038 SDValue ID =
6039 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6040
6041 auto Op1 = Op.getOperand(1);
6042 auto Op2 = Op.getOperand(2);
6043 auto Mask = Op.getOperand(3);
6044
6045 EVT Op1VT = Op1.getValueType();
6046 EVT Op2VT = Op2.getValueType();
6047 EVT ResVT = Op.getValueType();
6048
6049 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6050 Op1VT.getVectorElementType() == MVT::i16) &&
6051 "Expected 8-bit or 16-bit characters.");
6052
6053 // Scalable vector type used to wrap operands.
6054 // A single container is enough for both operands because ultimately the
6055 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6056 EVT OpContainerVT = Op1VT.isScalableVector()
 6057                               ? Op1VT
 6058                               : getContainerForFixedLengthVector(DAG, Op1VT);
6059
6060 if (Op2VT.is128BitVector()) {
6061 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6062 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6063 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6064 if (ResVT.isScalableVector())
6065 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6066 DAG.getTargetConstant(0, DL, MVT::i64));
6067 } else {
6068 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6069 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6070 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6071 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6072 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6073 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6074 DAG.getConstant(0, DL, MVT::i64));
6075 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6076 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6077 }
6078
6079 // If the result is scalable, we just need to carry out the MATCH.
6080 if (ResVT.isScalableVector())
6081 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6082
6083 // If the result is fixed, we can still use MATCH but we need to wrap the
6084 // first operand and the mask in scalable vectors before doing so.
6085
6086 // Wrap the operands.
6087 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6088 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6089 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6090
6091 // Carry out the match.
6092 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6093 ID, Mask, Op1, Op2);
6094
6095 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6096 // (v16i8/v8i8).
6097 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6098 Match = convertFromScalableVector(DAG, Op1VT, Match);
6099 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6100}
6101
6102SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6103 SelectionDAG &DAG) const {
6104 unsigned IntNo = Op.getConstantOperandVal(1);
6105 SDLoc DL(Op);
6106 switch (IntNo) {
6107 default:
6108 return SDValue(); // Don't custom lower most intrinsics.
6109 case Intrinsic::aarch64_prefetch: {
6110 SDValue Chain = Op.getOperand(0);
6111 SDValue Addr = Op.getOperand(2);
6112
6113 unsigned IsWrite = Op.getConstantOperandVal(3);
6114 unsigned Locality = Op.getConstantOperandVal(4);
6115 unsigned IsStream = Op.getConstantOperandVal(5);
6116 unsigned IsData = Op.getConstantOperandVal(6);
6117 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6118 (!IsData << 3) | // IsDataCache bit
6119 (Locality << 1) | // Cache level bits
6120 (unsigned)IsStream; // Stream bit
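    // For example, IsWrite=1, IsData=1, Locality=0, IsStream=0 encodes
    // PrfOp = 0b10000, i.e. a PSTL1KEEP prefetch.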
6121
6122 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6123 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6124 }
6125 case Intrinsic::aarch64_sme_str:
6126 case Intrinsic::aarch64_sme_ldr: {
6127 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6128 }
6129 case Intrinsic::aarch64_sme_za_enable:
6130 return DAG.getNode(
6131 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6132 Op->getOperand(0), // Chain
6133 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6134 case Intrinsic::aarch64_sme_za_disable:
6135 return DAG.getNode(
6136 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6137 Op->getOperand(0), // Chain
6138 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6139 }
6140}
6141
6142SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6143 SelectionDAG &DAG) const {
6144 unsigned IntNo = Op.getConstantOperandVal(1);
6145 SDLoc DL(Op);
6146 switch (IntNo) {
6147 default:
6148 return SDValue(); // Don't custom lower most intrinsics.
6149 case Intrinsic::aarch64_mops_memset_tag: {
6150 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6151 SDValue Chain = Node->getChain();
6152 SDValue Dst = Op.getOperand(2);
6153 SDValue Val = Op.getOperand(3);
6154 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6155 SDValue Size = Op.getOperand(4);
6156 auto Alignment = Node->getMemOperand()->getAlign();
6157 bool IsVol = Node->isVolatile();
6158 auto DstPtrInfo = Node->getPointerInfo();
6159
6160 const auto &SDI =
6161 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6162 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6163 Chain, Dst, Val, Size, Alignment, IsVol,
6164 DstPtrInfo, MachinePointerInfo{});
6165
6166 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6167 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6168 // LowerOperationWrapper will complain that the number of results has
6169 // changed.
6170 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6171 }
6172 }
6173}
6174
6175SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6176 SelectionDAG &DAG) const {
6177 unsigned IntNo = Op.getConstantOperandVal(0);
6178 SDLoc DL(Op);
6179 switch (IntNo) {
6180 default: return SDValue(); // Don't custom lower most intrinsics.
6181 case Intrinsic::thread_pointer: {
6182 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6183 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6184 }
6185 case Intrinsic::aarch64_sve_whilewr_b:
6186 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6187 Op.getOperand(1), Op.getOperand(2),
6188 DAG.getConstant(1, DL, MVT::i64));
6189 case Intrinsic::aarch64_sve_whilewr_h:
6190 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6191 Op.getOperand(1), Op.getOperand(2),
6192 DAG.getConstant(2, DL, MVT::i64));
6193 case Intrinsic::aarch64_sve_whilewr_s:
6194 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6195 Op.getOperand(1), Op.getOperand(2),
6196 DAG.getConstant(4, DL, MVT::i64));
6197 case Intrinsic::aarch64_sve_whilewr_d:
6198 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6199 Op.getOperand(1), Op.getOperand(2),
6200 DAG.getConstant(8, DL, MVT::i64));
6201 case Intrinsic::aarch64_sve_whilerw_b:
6202 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6203 Op.getOperand(1), Op.getOperand(2),
6204 DAG.getConstant(1, DL, MVT::i64));
6205 case Intrinsic::aarch64_sve_whilerw_h:
6206 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6207 Op.getOperand(1), Op.getOperand(2),
6208 DAG.getConstant(2, DL, MVT::i64));
6209 case Intrinsic::aarch64_sve_whilerw_s:
6210 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6211 Op.getOperand(1), Op.getOperand(2),
6212 DAG.getConstant(4, DL, MVT::i64));
6213 case Intrinsic::aarch64_sve_whilerw_d:
6214 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6215 Op.getOperand(1), Op.getOperand(2),
6216 DAG.getConstant(8, DL, MVT::i64));
6217 case Intrinsic::aarch64_neon_abs: {
6218 EVT Ty = Op.getValueType();
6219 if (Ty == MVT::i64) {
6220 SDValue Result =
6221 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6222 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6223 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6224 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6225 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6226 } else {
6227 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6228 }
6229 }
6230 case Intrinsic::aarch64_neon_pmull64: {
6231 SDValue LHS = Op.getOperand(1);
6232 SDValue RHS = Op.getOperand(2);
6233
6234 std::optional<uint64_t> LHSLane =
 6235         getConstantLaneNumOfExtractHalfOperand(LHS);
 6236     std::optional<uint64_t> RHSLane =
 6237         getConstantLaneNumOfExtractHalfOperand(RHS);
6238
6239 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6240 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6241
6242 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6243 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6244 // which ISel recognizes better. For example, generate a ldr into d*
6245 // registers as opposed to a GPR load followed by a fmov.
6246 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6247 std::optional<uint64_t> OtherLane,
6248 const SDLoc &DL,
6249 SelectionDAG &DAG) -> SDValue {
 6250       // If the operand is a higher half itself, rewrite it to
6251 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6252 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6253 if (NLane == 1)
6254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6255 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6256
6257 // Operand N is not a higher half but the other operand is.
6258 if (OtherLane == 1) {
6259 // If this operand is a lower half, rewrite it to
6260 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6261 // align lanes of two operands. A roundtrip sequence (to move from lane
6262 // 1 to lane 0) is like this:
6263 // mov x8, v0.d[1]
6264 // fmov d0, x8
6265 if (NLane == 0)
6266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6267 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6268 N.getOperand(0),
6269 DAG.getConstant(0, DL, MVT::i64)),
6270 DAG.getConstant(1, DL, MVT::i64));
6271
 6272       // Otherwise just duplicate the scalar operand to all lanes.
6273 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6274 }
6275
 6276       // Neither operand is an extract of the higher half, so codegen may just
 6277       // use the non-high form of PMULL. Use v1i64 to represent i64.
6278 assert(N.getValueType() == MVT::i64 &&
6279 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6280 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6281 };
6282
6283 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6284 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6285
6286 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6287 }
6288 case Intrinsic::aarch64_neon_smax:
6289 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6290 Op.getOperand(2));
6291 case Intrinsic::aarch64_neon_umax:
6292 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6293 Op.getOperand(2));
6294 case Intrinsic::aarch64_neon_smin:
6295 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6296 Op.getOperand(2));
6297 case Intrinsic::aarch64_neon_umin:
6298 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6299 Op.getOperand(2));
6300 case Intrinsic::aarch64_neon_scalar_sqxtn:
6301 case Intrinsic::aarch64_neon_scalar_sqxtun:
6302 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6303 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6304 if (Op.getValueType() == MVT::i32)
6305 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6306 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6307 Op.getOperand(0),
6308 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6309 Op.getOperand(1))));
6310 return SDValue();
6311 }
6312 case Intrinsic::aarch64_neon_sqxtn:
6313 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6314 Op.getOperand(1));
6315 case Intrinsic::aarch64_neon_sqxtun:
6316 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6317 Op.getOperand(1));
6318 case Intrinsic::aarch64_neon_uqxtn:
6319 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6320 Op.getOperand(1));
6321 case Intrinsic::aarch64_neon_sqshrn:
6322 if (Op.getValueType().isVector())
6323 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6324 DAG.getNode(AArch64ISD::VASHR, DL,
6325 Op.getOperand(1).getValueType(),
6326 Op.getOperand(1), Op.getOperand(2)));
6327 return SDValue();
6328 case Intrinsic::aarch64_neon_sqshrun:
6329 if (Op.getValueType().isVector())
6330 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6331 DAG.getNode(AArch64ISD::VASHR, DL,
6332 Op.getOperand(1).getValueType(),
6333 Op.getOperand(1), Op.getOperand(2)));
6334 return SDValue();
6335 case Intrinsic::aarch64_neon_uqshrn:
6336 if (Op.getValueType().isVector())
6337 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6338 DAG.getNode(AArch64ISD::VLSHR, DL,
6339 Op.getOperand(1).getValueType(),
6340 Op.getOperand(1), Op.getOperand(2)));
6341 return SDValue();
6342 case Intrinsic::aarch64_neon_sqrshrn:
6343 if (Op.getValueType().isVector())
6344 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6345 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6346 Op.getOperand(1).getValueType(),
6347 Op.getOperand(1), Op.getOperand(2)));
6348 return SDValue();
6349 case Intrinsic::aarch64_neon_sqrshrun:
6350 if (Op.getValueType().isVector())
6351 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6352 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6353 Op.getOperand(1).getValueType(),
6354 Op.getOperand(1), Op.getOperand(2)));
6355 return SDValue();
6356 case Intrinsic::aarch64_neon_uqrshrn:
6357 if (Op.getValueType().isVector())
6358 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6359 DAG.getNode(AArch64ISD::URSHR_I, DL,
6360 Op.getOperand(1).getValueType(),
6361 Op.getOperand(1), Op.getOperand(2)));
6362 return SDValue();
6363 case Intrinsic::aarch64_neon_sqadd:
6364 if (Op.getValueType().isVector())
6365 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6366 Op.getOperand(2));
6367 return SDValue();
6368 case Intrinsic::aarch64_neon_sqsub:
6369 if (Op.getValueType().isVector())
6370 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6371 Op.getOperand(2));
6372 return SDValue();
6373 case Intrinsic::aarch64_neon_uqadd:
6374 if (Op.getValueType().isVector())
6375 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6376 Op.getOperand(2));
6377 return SDValue();
6378 case Intrinsic::aarch64_neon_uqsub:
6379 if (Op.getValueType().isVector())
6380 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6381 Op.getOperand(2));
6382 return SDValue();
6383 case Intrinsic::aarch64_sve_whilelt:
6384 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6385 /*IsEqual=*/false);
6386 case Intrinsic::aarch64_sve_whilels:
6387 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6388 /*IsEqual=*/true);
6389 case Intrinsic::aarch64_sve_whilele:
6390 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6391 /*IsEqual=*/true);
6392 case Intrinsic::aarch64_sve_sunpkhi:
6393 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6394 Op.getOperand(1));
6395 case Intrinsic::aarch64_sve_sunpklo:
6396 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6397 Op.getOperand(1));
6398 case Intrinsic::aarch64_sve_uunpkhi:
6399 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6400 Op.getOperand(1));
6401 case Intrinsic::aarch64_sve_uunpklo:
6402 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6403 Op.getOperand(1));
6404 case Intrinsic::aarch64_sve_clasta_n:
6405 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6406 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6407 case Intrinsic::aarch64_sve_clastb_n:
6408 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6409 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6410 case Intrinsic::aarch64_sve_lasta:
6411 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6412 Op.getOperand(1), Op.getOperand(2));
6413 case Intrinsic::aarch64_sve_lastb:
6414 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6415 Op.getOperand(1), Op.getOperand(2));
6416 case Intrinsic::aarch64_sve_rev:
6417 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6418 Op.getOperand(1));
6419 case Intrinsic::aarch64_sve_tbl:
6420 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6421 Op.getOperand(2));
6422 case Intrinsic::aarch64_sve_trn1:
6423 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6424 Op.getOperand(1), Op.getOperand(2));
6425 case Intrinsic::aarch64_sve_trn2:
6426 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6427 Op.getOperand(1), Op.getOperand(2));
6428 case Intrinsic::aarch64_sve_uzp1:
6429 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6430 Op.getOperand(1), Op.getOperand(2));
6431 case Intrinsic::aarch64_sve_uzp2:
6432 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6433 Op.getOperand(1), Op.getOperand(2));
6434 case Intrinsic::aarch64_sve_zip1:
6435 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6436 Op.getOperand(1), Op.getOperand(2));
6437 case Intrinsic::aarch64_sve_zip2:
6438 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6439 Op.getOperand(1), Op.getOperand(2));
6440 case Intrinsic::aarch64_sve_splice:
6441 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6442 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6443 case Intrinsic::aarch64_sve_ptrue:
6444 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6445 case Intrinsic::aarch64_sve_clz:
6446 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6447 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6448 case Intrinsic::aarch64_sme_cntsd: {
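    // RDSVL #1 returns the streaming vector length in bytes; an exact shift
    // right by 3 then gives the number of 64-bit (doubleword) elements.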
6449 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6450 DAG.getConstant(1, DL, MVT::i32));
6451 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6452 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6453 }
6454 case Intrinsic::aarch64_sve_cnt: {
6455 SDValue Data = Op.getOperand(3);
6456 // CTPOP only supports integer operands.
6457 if (Data.getValueType().isFloatingPoint())
6458 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6459 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6460 Op.getOperand(2), Data, Op.getOperand(1));
6461 }
6462 case Intrinsic::aarch64_sve_dupq_lane:
6463 return LowerDUPQLane(Op, DAG);
6464 case Intrinsic::aarch64_sve_convert_from_svbool:
6465 if (Op.getValueType() == MVT::aarch64svcount)
6466 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6467 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6468 case Intrinsic::aarch64_sve_convert_to_svbool:
6469 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6470 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6471 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6472 case Intrinsic::aarch64_sve_fneg:
6473 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6474 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6475 case Intrinsic::aarch64_sve_frintp:
6476 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6478 case Intrinsic::aarch64_sve_frintm:
6479 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6480 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6481 case Intrinsic::aarch64_sve_frinti:
6482 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6483 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6484 Op.getOperand(1));
6485 case Intrinsic::aarch64_sve_frintx:
6486 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6487 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6488 case Intrinsic::aarch64_sve_frinta:
6489 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6490 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6491 case Intrinsic::aarch64_sve_frintn:
6492 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6493 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6494 Op.getOperand(1));
6495 case Intrinsic::aarch64_sve_frintz:
6496 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6497 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6498 case Intrinsic::aarch64_sve_ucvtf:
6499 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6500 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6501 Op.getOperand(1));
6502 case Intrinsic::aarch64_sve_scvtf:
6503 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6504 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6505 Op.getOperand(1));
6506 case Intrinsic::aarch64_sve_fcvtzu:
6507 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6508 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6509 case Intrinsic::aarch64_sve_fcvtzs:
6510 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6511 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6512 case Intrinsic::aarch64_sve_fsqrt:
6513 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6514 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6515 case Intrinsic::aarch64_sve_frecpx:
6516 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6517 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6518 case Intrinsic::aarch64_sve_frecpe_x:
6519 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6520 Op.getOperand(1));
6521 case Intrinsic::aarch64_sve_frecps_x:
6522 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6523 Op.getOperand(1), Op.getOperand(2));
6524 case Intrinsic::aarch64_sve_frsqrte_x:
6525 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6526 Op.getOperand(1));
6527 case Intrinsic::aarch64_sve_frsqrts_x:
6528 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6529 Op.getOperand(1), Op.getOperand(2));
6530 case Intrinsic::aarch64_sve_fabs:
6531 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6532 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6533 case Intrinsic::aarch64_sve_abs:
6534 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6535 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6536 case Intrinsic::aarch64_sve_neg:
6537 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6538 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_insr: {
6540 SDValue Scalar = Op.getOperand(2);
6541 EVT ScalarTy = Scalar.getValueType();
6542 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6543 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6544
6545 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6546 Op.getOperand(1), Scalar);
6547 }
6548 case Intrinsic::aarch64_sve_rbit:
6549 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6550 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6551 Op.getOperand(1));
6552 case Intrinsic::aarch64_sve_revb:
6553 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6554 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6555 case Intrinsic::aarch64_sve_revh:
6556 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6557 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6558 case Intrinsic::aarch64_sve_revw:
6559 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6560 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6561 case Intrinsic::aarch64_sve_revd:
6562 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6563 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6564 case Intrinsic::aarch64_sve_sxtb:
6565 return DAG.getNode(
6566 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6567 Op.getOperand(2), Op.getOperand(3),
6568 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6569 Op.getOperand(1));
6570 case Intrinsic::aarch64_sve_sxth:
6571 return DAG.getNode(
6572 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6573 Op.getOperand(2), Op.getOperand(3),
6574 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6575 Op.getOperand(1));
6576 case Intrinsic::aarch64_sve_sxtw:
6577 return DAG.getNode(
6578 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6579 Op.getOperand(2), Op.getOperand(3),
6580 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6581 Op.getOperand(1));
6582 case Intrinsic::aarch64_sve_uxtb:
6583 return DAG.getNode(
6584 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6585 Op.getOperand(2), Op.getOperand(3),
6586 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6587 Op.getOperand(1));
6588 case Intrinsic::aarch64_sve_uxth:
6589 return DAG.getNode(
6590 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6591 Op.getOperand(2), Op.getOperand(3),
6592 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6593 Op.getOperand(1));
6594 case Intrinsic::aarch64_sve_uxtw:
6595 return DAG.getNode(
6596 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6597 Op.getOperand(2), Op.getOperand(3),
6598 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6599 Op.getOperand(1));
6600 case Intrinsic::localaddress: {
6601 const auto &MF = DAG.getMachineFunction();
6602 const auto *RegInfo = Subtarget->getRegisterInfo();
6603 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6604 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6605 Op.getSimpleValueType());
6606 }
6607
6608 case Intrinsic::eh_recoverfp: {
6609 // FIXME: This needs to be implemented to correctly handle highly aligned
6610 // stack objects. For now we simply return the incoming FP. Refer D53541
6611 // for more details.
6612 SDValue FnOp = Op.getOperand(1);
6613 SDValue IncomingFPOp = Op.getOperand(2);
6614 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6615 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6616 if (!Fn)
 6617       report_fatal_error(
 6618           "llvm.eh.recoverfp must take a function as the first argument");
6619 return IncomingFPOp;
6620 }
6621 case Intrinsic::aarch64_neon_vsri:
6622 case Intrinsic::aarch64_neon_vsli:
6623 case Intrinsic::aarch64_sve_sri:
6624 case Intrinsic::aarch64_sve_sli: {
6625 EVT Ty = Op.getValueType();
6626
6627 if (!Ty.isVector())
6628 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6629
6630 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6631
6632 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6633 IntNo == Intrinsic::aarch64_sve_sri;
6634 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6635 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6636 Op.getOperand(3));
6637 }
6638
6639 case Intrinsic::aarch64_neon_srhadd:
6640 case Intrinsic::aarch64_neon_urhadd:
6641 case Intrinsic::aarch64_neon_shadd:
6642 case Intrinsic::aarch64_neon_uhadd: {
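    // [US]RHADD computes (a + b + 1) >> 1 and [US]HADD computes (a + b) >> 1
    // without intermediate overflow, i.e. ISD::AVGCEIL[SU] / ISD::AVGFLOOR[SU].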
6643 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6644 IntNo == Intrinsic::aarch64_neon_shadd);
6645 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6646 IntNo == Intrinsic::aarch64_neon_urhadd);
6647 unsigned Opcode = IsSignedAdd
6648 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6649 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6650 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6651 Op.getOperand(2));
6652 }
6653 case Intrinsic::aarch64_neon_saddlp:
6654 case Intrinsic::aarch64_neon_uaddlp: {
6655 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6656 ? AArch64ISD::UADDLP
6657 : AArch64ISD::SADDLP;
6658 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6659 }
6660 case Intrinsic::aarch64_neon_sdot:
6661 case Intrinsic::aarch64_neon_udot:
6662 case Intrinsic::aarch64_sve_sdot:
6663 case Intrinsic::aarch64_sve_udot: {
6664 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6665 IntNo == Intrinsic::aarch64_sve_udot)
6666 ? AArch64ISD::UDOT
6667 : AArch64ISD::SDOT;
6668 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6669 Op.getOperand(2), Op.getOperand(3));
6670 }
6671 case Intrinsic::aarch64_neon_usdot:
6672 case Intrinsic::aarch64_sve_usdot: {
6673 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6674 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6675 }
6676 case Intrinsic::aarch64_neon_saddlv:
6677 case Intrinsic::aarch64_neon_uaddlv: {
6678 EVT OpVT = Op.getOperand(1).getValueType();
6679 EVT ResVT = Op.getValueType();
6680 assert(
6681 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6682 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6683 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6684 "Unexpected aarch64_neon_u/saddlv type");
6685 (void)OpVT;
6686 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6687 SDValue ADDLV = DAG.getNode(
6688 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6689 : AArch64ISD::SADDLV,
6690 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6691 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6692 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6693 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6694 return EXTRACT_VEC_ELT;
6695 }
6696 case Intrinsic::experimental_cttz_elts: {
6697 SDValue CttzOp = Op.getOperand(1);
6698 EVT VT = CttzOp.getValueType();
6699 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6700
6701 if (VT.isFixedLengthVector()) {
6702 // We can use SVE instructions to lower this intrinsic by first creating
6703 // an SVE predicate register mask from the fixed-width vector.
6704 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6705 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6706 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6707 }
6708
6709 SDValue NewCttzElts =
6710 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6711 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6712 }
6713 case Intrinsic::experimental_vector_match: {
6714 return LowerVectorMatch(Op, DAG);
6715 }
6716 }
6717}
6718
6719bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6720 if (VT.getVectorElementType() == MVT::i8 ||
6721 VT.getVectorElementType() == MVT::i16) {
6722 EltTy = MVT::i32;
6723 return true;
6724 }
6725 return false;
6726}
6727
6728bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6729 EVT DataVT) const {
6730 const EVT IndexVT = Extend.getOperand(0).getValueType();
6731 // SVE only supports implicit extension of 32-bit indices.
6732 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6733 return false;
6734
6735 // Indices cannot be smaller than the main data type.
6736 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6737 return false;
6738
6739 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6740 // element container type, which would violate the previous clause.
6741 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6742}
6743
6744bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6745 EVT ExtVT = ExtVal.getValueType();
6746 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6747 return false;
6748
6749 // It may be worth creating extending masked loads if there are multiple
6750 // masked loads using the same predicate. That way we'll end up creating
6751 // extending masked loads that may then get split by the legaliser. This
6752 // results in just one set of predicate unpacks at the start, instead of
6753 // multiple sets of vector unpacks after each load.
6754 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6755 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6756 // Disable extending masked loads for fixed-width for now, since the code
6757 // quality doesn't look great.
6758 if (!ExtVT.isScalableVector())
6759 return false;
6760
6761 unsigned NumExtMaskedLoads = 0;
6762 for (auto *U : Ld->getMask()->users())
6763 if (isa<MaskedLoadSDNode>(U))
6764 NumExtMaskedLoads++;
6765
6766 if (NumExtMaskedLoads <= 1)
6767 return false;
6768 }
6769 }
6770
6771 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6772 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6773 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6774}
6775
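// Map the (IsScaled, IsSigned, NeedsExtend) addressing-mode triple onto the
// matching unsigned GLD1 gather opcode. For example, an unscaled gather whose
// indices need zero-extension maps to AArch64ISD::GLD1_UXTW_MERGE_ZERO, while
// a scaled gather with sign-extended indices maps to
// AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO.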
6776unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6777 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6778 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6779 AArch64ISD::GLD1_MERGE_ZERO},
6780 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6781 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6782 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6783 AArch64ISD::GLD1_MERGE_ZERO},
6784 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6785 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6786 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6787 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6788 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6789 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6790 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6791 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6792 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6793 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6794 };
6795 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6796 return AddrModes.find(Key)->second;
6797}
6798
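// Convert a zero-extending GLD1 gather opcode into its sign-extending GLD1S
// counterpart, preserving the addressing mode.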
6799unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6800 switch (Opcode) {
6801 default:
6802 llvm_unreachable("unimplemented opcode");
6803 return Opcode;
6804 case AArch64ISD::GLD1_MERGE_ZERO:
6805 return AArch64ISD::GLD1S_MERGE_ZERO;
6806 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6807 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6808 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6809 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6810 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6811 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6812 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6813 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6814 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6815 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6816 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6817 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6818 }
6819}
6820
6821SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6822 SelectionDAG &DAG) const {
6823 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6824
6825 SDLoc DL(Op);
6826 SDValue Chain = MGT->getChain();
6827 SDValue PassThru = MGT->getPassThru();
6828 SDValue Mask = MGT->getMask();
6829 SDValue BasePtr = MGT->getBasePtr();
6830 SDValue Index = MGT->getIndex();
6831 SDValue Scale = MGT->getScale();
6832 EVT VT = Op.getValueType();
6833 EVT MemVT = MGT->getMemoryVT();
6834 ISD::LoadExtType ExtType = MGT->getExtensionType();
6835 ISD::MemIndexType IndexType = MGT->getIndexType();
6836
6837  // SVE supports zero (and so undef) passthrough values only; everything else
6838  // must be handled manually by an explicit select on the load's output.
6839 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6840 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6841 SDValue Load =
6842 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6843 MGT->getMemOperand(), IndexType, ExtType);
6844 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6845 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6846 }
6847
6848 bool IsScaled = MGT->isIndexScaled();
6849 bool IsSigned = MGT->isIndexSigned();
6850
6851  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6852  // must be calculated beforehand.
6853 uint64_t ScaleVal = Scale->getAsZExtVal();
6854 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6855 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6856 EVT IndexVT = Index.getValueType();
6857 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6858 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6859 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6860
6861 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6862 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6863 MGT->getMemOperand(), IndexType, ExtType);
6864 }
6865
6866 // Lower fixed length gather to a scalable equivalent.
6867 if (VT.isFixedLengthVector()) {
6868 assert(Subtarget->useSVEForFixedLengthVectors() &&
6869 "Cannot lower when not using SVE for fixed vectors!");
6870
6871 // NOTE: Handle floating-point as if integer then bitcast the result.
6872 EVT DataVT = VT.changeVectorElementTypeToInteger();
6873 MemVT = MemVT.changeVectorElementTypeToInteger();
6874
6875 // Find the smallest integer fixed length vector we can use for the gather.
6876 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6877 if (DataVT.getVectorElementType() == MVT::i64 ||
6878 Index.getValueType().getVectorElementType() == MVT::i64 ||
6879 Mask.getValueType().getVectorElementType() == MVT::i64)
6880 PromotedVT = VT.changeVectorElementType(MVT::i64);
6881
6882 // Promote vector operands except for passthrough, which we know is either
6883 // undef or zero, and thus best constructed directly.
6884 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6885 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6886 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6887
6888 // A promoted result type forces the need for an extending load.
6889 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6890 ExtType = ISD::EXTLOAD;
6891
6892 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6893
6894 // Convert fixed length vector operands to scalable.
6895 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6896 Index = convertToScalableVector(DAG, ContainerVT, Index);
6897    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6898    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6899 : DAG.getConstant(0, DL, ContainerVT);
6900
6901 // Emit equivalent scalable vector gather.
6902 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6903 SDValue Load =
6904 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6905 Ops, MGT->getMemOperand(), IndexType, ExtType);
6906
6907 // Extract fixed length data then convert to the required result type.
6908 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6909 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6910 if (VT.isFloatingPoint())
6911 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6912
6913 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6914 }
6915
6916 // Everything else is legal.
6917 return Op;
6918}
6919
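// Lower MSCATTER much like MGATHER above: fold an unsupported index scale
// into an explicit shift of the indices, and promote fixed-length operands to
// scalable containers so the scatter can be emitted as an SVE operation.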
6920SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6921 SelectionDAG &DAG) const {
6922 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6923
6924 SDLoc DL(Op);
6925 SDValue Chain = MSC->getChain();
6926 SDValue StoreVal = MSC->getValue();
6927 SDValue Mask = MSC->getMask();
6928 SDValue BasePtr = MSC->getBasePtr();
6929 SDValue Index = MSC->getIndex();
6930 SDValue Scale = MSC->getScale();
6931 EVT VT = StoreVal.getValueType();
6932 EVT MemVT = MSC->getMemoryVT();
6933 ISD::MemIndexType IndexType = MSC->getIndexType();
6934 bool Truncating = MSC->isTruncatingStore();
6935
6936 bool IsScaled = MSC->isIndexScaled();
6937 bool IsSigned = MSC->isIndexSigned();
6938
6939  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6940  // must be calculated beforehand.
6941 uint64_t ScaleVal = Scale->getAsZExtVal();
6942 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6943 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6944 EVT IndexVT = Index.getValueType();
6945 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6946 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6947 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6948
6949 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6950 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6951 MSC->getMemOperand(), IndexType, Truncating);
6952 }
6953
6954 // Lower fixed length scatter to a scalable equivalent.
6955 if (VT.isFixedLengthVector()) {
6956 assert(Subtarget->useSVEForFixedLengthVectors() &&
6957 "Cannot lower when not using SVE for fixed vectors!");
6958
6959 // Once bitcast we treat floating-point scatters as if integer.
6960 if (VT.isFloatingPoint()) {
6961      VT = VT.changeVectorElementTypeToInteger();
6962      MemVT = MemVT.changeVectorElementTypeToInteger();
6963 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6964 }
6965
6966 // Find the smallest integer fixed length vector we can use for the scatter.
6967 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6968 if (VT.getVectorElementType() == MVT::i64 ||
6969 Index.getValueType().getVectorElementType() == MVT::i64 ||
6970 Mask.getValueType().getVectorElementType() == MVT::i64)
6971 PromotedVT = VT.changeVectorElementType(MVT::i64);
6972
6973 // Promote vector operands.
6974 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6975 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6976 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6977 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6978
6979 // A promoted value type forces the need for a truncating store.
6980 if (PromotedVT != VT)
6981 Truncating = true;
6982
6983 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6984
6985 // Convert fixed length vector operands to scalable.
6986 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6987 Index = convertToScalableVector(DAG, ContainerVT, Index);
6988    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6989    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6990
6991 // Emit equivalent scalable vector scatter.
6992 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6993 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6994 MSC->getMemOperand(), IndexType, Truncating);
6995 }
6996
6997 // Everything else is legal.
6998 return Op;
6999}
7000
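// Lower a masked load whose passthru is neither undef nor all zeroes by
// performing the load with an undef passthru and then selecting between the
// loaded value and the original passthru, since SVE masked loads only zero
// the inactive lanes.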
7001SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7002 SDLoc DL(Op);
7003 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7004 assert(LoadNode && "Expected custom lowering of a masked load node");
7005 EVT VT = Op->getValueType(0);
7006
7007 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7008 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7009
7010 SDValue PassThru = LoadNode->getPassThru();
7011 SDValue Mask = LoadNode->getMask();
7012
7013 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7014 return Op;
7015
7016  SDValue Load = DAG.getMaskedLoad(
7017      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7018 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7019 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7020 LoadNode->getExtensionType());
7021
7022 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7023
7024 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7025}
7026
7027// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7028 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7029                                         EVT VT, EVT MemVT,
7030 SelectionDAG &DAG) {
7031 assert(VT.isVector() && "VT should be a vector type");
7032 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7033
7034 SDValue Value = ST->getValue();
7035
7036  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7037  // extracts the word lane which represents the v4i8 subvector. This
7038  // optimizes the store to:
7039 //
7040 // xtn v0.8b, v0.8h
7041 // str s0, [x0]
7042
7043 SDValue Undef = DAG.getUNDEF(MVT::i16);
7044 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7045 {Undef, Undef, Undef, Undef});
7046
7047 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7048 Value, UndefVec);
7049 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7050
7051 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7052 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7053 Trunc, DAG.getConstant(0, DL, MVT::i64));
7054
7055 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7056 ST->getBasePtr(), ST->getMemOperand());
7057}
7058
7059 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7060   SDLoc DL(Op);
7061 SDValue Src = Op.getOperand(0);
7062 MVT DestVT = Op.getSimpleValueType();
7063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7064   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
7065 
7066 unsigned SrcAS = N->getSrcAddressSpace();
7067 unsigned DestAS = N->getDestAddressSpace();
7068 assert(SrcAS != DestAS &&
7069 "addrspacecast must be between different address spaces");
7070 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7071 TLI.getTargetMachine().getPointerSize(DestAS) &&
7072 "addrspacecast must be between different ptr sizes");
7073 (void)TLI;
7074
7075 if (SrcAS == ARM64AS::PTR32_SPTR) {
7076 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7077 DAG.getTargetConstant(0, DL, DestVT));
7078 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7079 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7080 DAG.getTargetConstant(0, DL, DestVT));
7081 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7082 (DestAS == ARM64AS::PTR32_UPTR)) {
7083 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7084 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7085 return Trunc;
7086 } else {
7087 return Src;
7088 }
7089}
7090
7091 // Custom lowering for any store, vector or scalar, truncating or not. This
7092 // currently handles v4i16-to-v4i8 truncating stores, volatile i128 stores,
7093 // i64x8 (LS64) stores, and 256-bit non-temporal vector stores.
7094SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7095 SelectionDAG &DAG) const {
7096 SDLoc Dl(Op);
7097 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7098 assert (StoreNode && "Can only custom lower store nodes");
7099
7100 SDValue Value = StoreNode->getValue();
7101
7102 EVT VT = Value.getValueType();
7103 EVT MemVT = StoreNode->getMemoryVT();
7104
7105 if (VT.isVector()) {
7106     if (useSVEForFixedLengthVectorVT(
7107             VT,
7108 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7109 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7110
7111 unsigned AS = StoreNode->getAddressSpace();
7112 Align Alignment = StoreNode->getAlign();
7113 if (Alignment < MemVT.getStoreSize() &&
7114 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7115 StoreNode->getMemOperand()->getFlags(),
7116 nullptr)) {
7117 return scalarizeVectorStore(StoreNode, DAG);
7118 }
7119
7120 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7121 MemVT == MVT::v4i8) {
7122 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7123 }
7124 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7125 // the custom lowering, as there are no un-paired non-temporal stores and
7126 // legalization will break up 256 bit inputs.
7127 ElementCount EC = MemVT.getVectorElementCount();
7128 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7129 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7130 (MemVT.getScalarSizeInBits() == 8u ||
7131 MemVT.getScalarSizeInBits() == 16u ||
7132 MemVT.getScalarSizeInBits() == 32u ||
7133 MemVT.getScalarSizeInBits() == 64u)) {
7134 SDValue Lo =
7135          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7136                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7137                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7138 SDValue Hi =
7139          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7140                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7141                      StoreNode->getValue(),
7142 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7143      SDValue Result = DAG.getMemIntrinsicNode(
7144          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7145 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7146 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7147 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7148 return Result;
7149 }
7150 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7151 return LowerStore128(Op, DAG);
7152 } else if (MemVT == MVT::i64x8) {
7153 SDValue Value = StoreNode->getValue();
7154 assert(Value->getValueType(0) == MVT::i64x8);
7155 SDValue Chain = StoreNode->getChain();
7156 SDValue Base = StoreNode->getBasePtr();
7157 EVT PtrVT = Base.getValueType();
7158 for (unsigned i = 0; i < 8; i++) {
7159 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7160 Value, DAG.getConstant(i, Dl, MVT::i32));
7161 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7162 DAG.getConstant(i * 8, Dl, PtrVT));
7163 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7164 StoreNode->getBaseAlign());
7165 }
7166 return Chain;
7167 }
7168
7169 return SDValue();
7170}
7171
7172/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7173SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7174 SelectionDAG &DAG) const {
7175 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7176 assert(StoreNode->getMemoryVT() == MVT::i128);
7177 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7178
7179 bool IsStoreRelease =
7180      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7181  if (StoreNode->isAtomic())
7182 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7183 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7184           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7185           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7186 
7187 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7188 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7189 ? StoreNode->getOperand(1)
7190 : StoreNode->getOperand(2);
7191 SDLoc DL(Op);
7192 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7193 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7194 if (DAG.getDataLayout().isBigEndian())
7195 std::swap(StoreValue.first, StoreValue.second);
7196  SDValue Result = DAG.getMemIntrinsicNode(
7197      Opcode, DL, DAG.getVTList(MVT::Other),
7198 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7199 StoreNode->getBasePtr()},
7200 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7201 return Result;
7202}
7203
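// Custom load lowering: split i64x8 (LS64) loads into eight i64 loads that
// are recombined with LS64_BUILD, and widen extending v4i8 loads by loading
// the four bytes as an f32 scalar and extending in vector registers.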
7204SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7205 SelectionDAG &DAG) const {
7206 SDLoc DL(Op);
7207 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7208 assert(LoadNode && "Expected custom lowering of a load node");
7209
7210 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7211    SmallVector<SDValue, 8> Ops;
7212    SDValue Base = LoadNode->getBasePtr();
7213 SDValue Chain = LoadNode->getChain();
7214 EVT PtrVT = Base.getValueType();
7215 for (unsigned i = 0; i < 8; i++) {
7216 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7217 DAG.getConstant(i * 8, DL, PtrVT));
7218 SDValue Part =
7219 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7220 LoadNode->getBaseAlign());
7221 Ops.push_back(Part);
7222 Chain = SDValue(Part.getNode(), 1);
7223 }
7224 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7225 return DAG.getMergeValues({Loaded, Chain}, DL);
7226 }
7227
7228 // Custom lowering for extending v4i8 vector loads.
7229 EVT VT = Op->getValueType(0);
7230 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7231
7232 if (LoadNode->getMemoryVT() != MVT::v4i8)
7233 return SDValue();
7234
7235 // Avoid generating unaligned loads.
7236 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7237 return SDValue();
7238
7239 unsigned ExtType;
7240 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7241 ExtType = ISD::SIGN_EXTEND;
7242 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7243 LoadNode->getExtensionType() == ISD::EXTLOAD)
7244 ExtType = ISD::ZERO_EXTEND;
7245 else
7246 return SDValue();
7247
7248 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7249 LoadNode->getBasePtr(), MachinePointerInfo());
7250 SDValue Chain = Load.getValue(1);
7251 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7252 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7253 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7254 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7255 DAG.getConstant(0, DL, MVT::i64));
7256 if (VT == MVT::v4i32)
7257 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7258 return DAG.getMergeValues({Ext, Chain}, DL);
7259}
7260
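// Lower VECTOR_COMPRESS using the SVE COMPACT instruction: widen the input to
// a legal <vscale x 4 x i32> or <vscale x 2 x i64> container, compact the
// active elements to the front, and, when a non-zero passthru is supplied,
// merge the trailing lanes back in using a CNTP/WHILELO mask and a VSELECT.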
7261SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7262 SelectionDAG &DAG) const {
7263 SDLoc DL(Op);
7264 SDValue Vec = Op.getOperand(0);
7265 SDValue Mask = Op.getOperand(1);
7266 SDValue Passthru = Op.getOperand(2);
7267 EVT VecVT = Vec.getValueType();
7268 EVT MaskVT = Mask.getValueType();
7269 EVT ElmtVT = VecVT.getVectorElementType();
7270 const bool IsFixedLength = VecVT.isFixedLengthVector();
7271 const bool HasPassthru = !Passthru.isUndef();
7272 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7273 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7274
7275 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7276
7277 if (!Subtarget->isSVEAvailable())
7278 return SDValue();
7279
7280 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7281 return SDValue();
7282
7283 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7284 if (MinElmts != 2 && MinElmts != 4)
7285 return SDValue();
7286
7287 // We can use the SVE register containing the NEON vector in its lowest bits.
7288 if (IsFixedLength) {
7289 EVT ScalableVecVT =
7290 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7291 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7292 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7293
7294 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7295 DAG.getUNDEF(ScalableVecVT), Vec,
7296 DAG.getConstant(0, DL, MVT::i64));
7297 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7298 DAG.getUNDEF(ScalableMaskVT), Mask,
7299 DAG.getConstant(0, DL, MVT::i64));
7300    Mask = DAG.getNode(ISD::TRUNCATE, DL,
7301                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7302 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7303 DAG.getUNDEF(ScalableVecVT), Passthru,
7304 DAG.getConstant(0, DL, MVT::i64));
7305
7306 VecVT = Vec.getValueType();
7307 MaskVT = Mask.getValueType();
7308 }
7309
7310 // Get legal type for compact instruction
7311 EVT ContainerVT = getSVEContainerType(VecVT);
7312 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7313
7314 // Convert to i32 or i64 for smaller types, as these are the only supported
7315 // sizes for compact.
7316 if (ContainerVT != VecVT) {
7317 Vec = DAG.getBitcast(CastVT, Vec);
7318 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7319 }
7320
7321 SDValue Compressed = DAG.getNode(
7322      ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7323      DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7324
7325 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7326 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7327 SDValue Offset = DAG.getNode(
7328 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7329 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7330
7331 SDValue IndexMask = DAG.getNode(
7332 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7333 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7334 DAG.getConstant(0, DL, MVT::i64), Offset);
7335
7336 Compressed =
7337 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7338 }
7339
7340 // Extracting from a legal SVE type before truncating produces better code.
7341 if (IsFixedLength) {
7342 Compressed = DAG.getNode(
7343        ISD::EXTRACT_SUBVECTOR, DL,
7344        FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7345 Compressed, DAG.getConstant(0, DL, MVT::i64));
7346 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7347 VecVT = FixedVecVT;
7348 }
7349
7350 // If we changed the element type before, we need to convert it back.
7351 if (ContainerVT != VecVT) {
7352 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7353 Compressed = DAG.getBitcast(VecVT, Compressed);
7354 }
7355
7356 return Compressed;
7357}
7358
7359// Generate SUBS and CSEL for integer abs.
7360SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7361 MVT VT = Op.getSimpleValueType();
7362
7363 if (VT.isVector())
7364 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7365
7366 SDLoc DL(Op);
7367 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7368
7369 // Generate SUBS & CSEL.
7370 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7371 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7372 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7373 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7374}
7375
7376 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7377   SDValue Chain = Op.getOperand(0);
7378 SDValue Cond = Op.getOperand(1);
7379 SDValue Dest = Op.getOperand(2);
7380
7381   AArch64CC::CondCode CC;
7382   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7383 SDLoc DL(Op);
7384 SDValue CCVal = getCondCode(DAG, CC);
7385 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7386 Cmp);
7387 }
7388
7389 return SDValue();
7390}
7391
7392 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7393 // expanded. FSHL is converted to FSHR before deciding what to do with it.
7394 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7395   SDValue Shifts = Op.getOperand(2);
7396 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7397 // If opcode is FSHL, convert it to FSHR
7398 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7399 SDLoc DL(Op);
7400 MVT VT = Op.getSimpleValueType();
7401 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7402
7403 if (Op.getOpcode() == ISD::FSHL) {
7404 if (NewShiftNo == 0)
7405 return Op.getOperand(0);
7406
7407 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7408 return DAG.getNode(
7409 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7410 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7411 }
7412
7413 if (Op.getOpcode() == ISD::FSHR) {
7414 if (NewShiftNo == 0)
7415 return Op.getOperand(1);
7416
7417 if (ShiftNo->getZExtValue() == NewShiftNo)
7418 return Op;
7419
7420 // Rewrite using the normalised shift amount.
7421 return DAG.getNode(
7422 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7423 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7424 }
7425 }
7426
7427 return SDValue();
7428}
7429
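// Lower FLDEXP (ldexp) by inserting the value and exponent into SVE vectors
// and applying the sve.fscale intrinsic, then extracting lane 0 of the
// result. f16 and bf16 inputs are extended to f32 first and rounded back.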
7430 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7431   SDValue X = Op.getOperand(0);
7432 EVT XScalarTy = X.getValueType();
7433 SDValue Exp = Op.getOperand(1);
7434
7435 SDLoc DL(Op);
7436 EVT XVT, ExpVT;
7437 switch (Op.getSimpleValueType().SimpleTy) {
7438 default:
7439 return SDValue();
7440 case MVT::bf16:
7441 case MVT::f16:
7442 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7443 [[fallthrough]];
7444 case MVT::f32:
7445 XVT = MVT::nxv4f32;
7446 ExpVT = MVT::nxv4i32;
7447 break;
7448 case MVT::f64:
7449 XVT = MVT::nxv2f64;
7450 ExpVT = MVT::nxv2i64;
7451 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7452 break;
7453 }
7454
7455 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7456 SDValue VX =
7457 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7458 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7459 DAG.getUNDEF(ExpVT), Exp, Zero);
7460 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7461 AArch64SVEPredPattern::all);
7462 SDValue FScale =
7463      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7464                  DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7465 VPg, VX, VExp);
7466 SDValue Final =
7467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7468 if (X.getValueType() != XScalarTy)
7469 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7470 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7471 return Final;
7472}
7473
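// No adjustment of the trampoline pointer is needed on AArch64; simply return
// the address produced by INIT_TRAMPOLINE.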
7474SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7475 SelectionDAG &DAG) const {
7476 return Op.getOperand(0);
7477}
7478
7479SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7480 SelectionDAG &DAG) const {
7481 SDValue Chain = Op.getOperand(0);
7482 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7483 SDValue FPtr = Op.getOperand(2); // nested function
7484 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7485
7486 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7487
7488 // ldr NestReg, .+16
7489 // ldr x17, .+20
7490 // br x17
7491 // .word 0
7492 // .nest: .qword nest
7493 // .fptr: .qword fptr
7494 SDValue OutChains[5];
7495
7496 const Function *Func =
7497 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7498 CallingConv::ID CC = Func->getCallingConv();
7499 unsigned NestReg;
7500
7501 switch (CC) {
7502 default:
7503 NestReg = 0x0f; // X15
7504 break;
7506 // Must be kept in sync with AArch64CallingConv.td
7507 NestReg = 0x04; // X4
7508 break;
7509 }
7510
7511 const char FptrReg = 0x11; // X17
7512
7513 SDValue Addr = Trmp;
7514
7515 SDLoc DL(Op);
7516 OutChains[0] = DAG.getStore(
7517 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7518 MachinePointerInfo(TrmpAddr));
7519
7520 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7521 DAG.getConstant(4, DL, MVT::i64));
7522 OutChains[1] = DAG.getStore(
7523 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7524 MachinePointerInfo(TrmpAddr, 4));
7525
7526 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7527 DAG.getConstant(8, DL, MVT::i64));
7528 OutChains[2] =
7529 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7530 MachinePointerInfo(TrmpAddr, 8));
7531
7532 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7533 DAG.getConstant(16, DL, MVT::i64));
7534 OutChains[3] =
7535 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7536
7537 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7538 DAG.getConstant(24, DL, MVT::i64));
7539 OutChains[4] =
7540 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7541
7542 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7543
7544 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7545 DAG.getConstant(12, DL, MVT::i64));
7546
7547 // Call clear cache on the trampoline instructions.
7548 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7549 EndOfTrmp);
7550}
7551
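// Top-level entry point for custom lowering: dispatch each custom-lowered ISD
// opcode to its dedicated Lower* routine.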
7552 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7553                                               SelectionDAG &DAG) const {
7554 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7555 LLVM_DEBUG(Op.dump());
7556
7557 switch (Op.getOpcode()) {
7558 default:
7559 llvm_unreachable("unimplemented operand");
7560 return SDValue();
7563 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7564 case ISD::BITCAST:
7565 return LowerBITCAST(Op, DAG);
7566 case ISD::GlobalAddress:
7567 return LowerGlobalAddress(Op, DAG);
7568  case ISD::GlobalTLSAddress:
7569    return LowerGlobalTLSAddress(Op, DAG);
7570  case ISD::PtrAuthGlobalAddress:
7571    return LowerPtrAuthGlobalAddress(Op, DAG);
7572 case ISD::ADJUST_TRAMPOLINE:
7573 return LowerADJUST_TRAMPOLINE(Op, DAG);
7574 case ISD::INIT_TRAMPOLINE:
7575 return LowerINIT_TRAMPOLINE(Op, DAG);
7576 case ISD::SETCC:
7577 case ISD::STRICT_FSETCC:
7578  case ISD::STRICT_FSETCCS:
7579    return LowerSETCC(Op, DAG);
7580 case ISD::SETCCCARRY:
7581 return LowerSETCCCARRY(Op, DAG);
7582 case ISD::BRCOND:
7583 return LowerBRCOND(Op, DAG);
7584 case ISD::BR_CC:
7585 return LowerBR_CC(Op, DAG);
7586 case ISD::SELECT:
7587 return LowerSELECT(Op, DAG);
7588 case ISD::SELECT_CC:
7589 return LowerSELECT_CC(Op, DAG);
7590 case ISD::JumpTable:
7591 return LowerJumpTable(Op, DAG);
7592 case ISD::BR_JT:
7593 return LowerBR_JT(Op, DAG);
7594 case ISD::BRIND:
7595 return LowerBRIND(Op, DAG);
7596 case ISD::ConstantPool:
7597 return LowerConstantPool(Op, DAG);
7598 case ISD::BlockAddress:
7599 return LowerBlockAddress(Op, DAG);
7600 case ISD::VASTART:
7601 return LowerVASTART(Op, DAG);
7602 case ISD::VACOPY:
7603 return LowerVACOPY(Op, DAG);
7604 case ISD::VAARG:
7605 return LowerVAARG(Op, DAG);
7606 case ISD::UADDO_CARRY:
7607 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7608 case ISD::USUBO_CARRY:
7609 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7610 case ISD::SADDO_CARRY:
7611 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7612 case ISD::SSUBO_CARRY:
7613 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7614 case ISD::SADDO:
7615 case ISD::UADDO:
7616 case ISD::SSUBO:
7617 case ISD::USUBO:
7618 case ISD::SMULO:
7619 case ISD::UMULO:
7620 return LowerXALUO(Op, DAG);
7621 case ISD::FADD:
7622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7623 case ISD::FSUB:
7624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7625 case ISD::FMUL:
7626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7627 case ISD::FMA:
7628 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7629 case ISD::FDIV:
7630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7631 case ISD::FNEG:
7632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7633 case ISD::FCEIL:
7634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7635 case ISD::FFLOOR:
7636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7637 case ISD::FNEARBYINT:
7638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7639 case ISD::FRINT:
7640 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7641 case ISD::FROUND:
7642 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7643 case ISD::FROUNDEVEN:
7644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7645 case ISD::FTRUNC:
7646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7647 case ISD::FSQRT:
7648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7649 case ISD::FABS:
7650 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7651 case ISD::FP_ROUND:
7652  case ISD::STRICT_FP_ROUND:
7653    return LowerFP_ROUND(Op, DAG);
7654 case ISD::FP_EXTEND:
7655  case ISD::STRICT_FP_EXTEND:
7656    return LowerFP_EXTEND(Op, DAG);
7657 case ISD::FRAMEADDR:
7658 return LowerFRAMEADDR(Op, DAG);
7659 case ISD::SPONENTRY:
7660 return LowerSPONENTRY(Op, DAG);
7661 case ISD::RETURNADDR:
7662 return LowerRETURNADDR(Op, DAG);
7663  case ISD::ADDROFRETURNADDR:
7664    return LowerADDROFRETURNADDR(Op, DAG);
7665  case ISD::CONCAT_VECTORS:
7666    return LowerCONCAT_VECTORS(Op, DAG);
7667  case ISD::INSERT_VECTOR_ELT:
7668    return LowerINSERT_VECTOR_ELT(Op, DAG);
7669  case ISD::EXTRACT_VECTOR_ELT:
7670    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7671 case ISD::BUILD_VECTOR:
7672 return LowerBUILD_VECTOR(Op, DAG);
7673  case ISD::ZERO_EXTEND_VECTOR_INREG:
7674    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7675  case ISD::VECTOR_SHUFFLE:
7676    return LowerVECTOR_SHUFFLE(Op, DAG);
7677 case ISD::SPLAT_VECTOR:
7678 return LowerSPLAT_VECTOR(Op, DAG);
7679  case ISD::EXTRACT_SUBVECTOR:
7680    return LowerEXTRACT_SUBVECTOR(Op, DAG);
7681  case ISD::INSERT_SUBVECTOR:
7682    return LowerINSERT_SUBVECTOR(Op, DAG);
7683 case ISD::SDIV:
7684 case ISD::UDIV:
7685 return LowerDIV(Op, DAG);
7686 case ISD::SMIN:
7687 case ISD::UMIN:
7688 case ISD::SMAX:
7689 case ISD::UMAX:
7690 return LowerMinMax(Op, DAG);
7691 case ISD::SRA:
7692 case ISD::SRL:
7693 case ISD::SHL:
7694 return LowerVectorSRA_SRL_SHL(Op, DAG);
7695 case ISD::SHL_PARTS:
7696 case ISD::SRL_PARTS:
7697 case ISD::SRA_PARTS:
7698 return LowerShiftParts(Op, DAG);
7699 case ISD::CTPOP:
7700 case ISD::PARITY:
7701 return LowerCTPOP_PARITY(Op, DAG);
7702 case ISD::FCOPYSIGN:
7703 return LowerFCOPYSIGN(Op, DAG);
7704 case ISD::OR:
7705 return LowerVectorOR(Op, DAG);
7706 case ISD::XOR:
7707 return LowerXOR(Op, DAG);
7708 case ISD::PREFETCH:
7709 return LowerPREFETCH(Op, DAG);
7710 case ISD::SINT_TO_FP:
7711 case ISD::UINT_TO_FP:
7712  case ISD::STRICT_SINT_TO_FP:
7713  case ISD::STRICT_UINT_TO_FP:
7714    return LowerINT_TO_FP(Op, DAG);
7715 case ISD::FP_TO_SINT:
7716 case ISD::FP_TO_UINT:
7717  case ISD::STRICT_FP_TO_SINT:
7718  case ISD::STRICT_FP_TO_UINT:
7719    return LowerFP_TO_INT(Op, DAG);
7720  case ISD::FP_TO_SINT_SAT:
7721  case ISD::FP_TO_UINT_SAT:
7722    return LowerFP_TO_INT_SAT(Op, DAG);
7723 case ISD::FSINCOS:
7724 return LowerFSINCOS(Op, DAG);
7725 case ISD::GET_ROUNDING:
7726 return LowerGET_ROUNDING(Op, DAG);
7727 case ISD::SET_ROUNDING:
7728 return LowerSET_ROUNDING(Op, DAG);
7729 case ISD::GET_FPMODE:
7730 return LowerGET_FPMODE(Op, DAG);
7731 case ISD::SET_FPMODE:
7732 return LowerSET_FPMODE(Op, DAG);
7733 case ISD::RESET_FPMODE:
7734 return LowerRESET_FPMODE(Op, DAG);
7735 case ISD::MUL:
7736 return LowerMUL(Op, DAG);
7737 case ISD::MULHS:
7738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7739 case ISD::MULHU:
7740 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7741  case ISD::INTRINSIC_W_CHAIN:
7742    return LowerINTRINSIC_W_CHAIN(Op, DAG);
7743  case ISD::INTRINSIC_WO_CHAIN:
7744    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7745  case ISD::INTRINSIC_VOID:
7746    return LowerINTRINSIC_VOID(Op, DAG);
7747 case ISD::ATOMIC_STORE:
7748 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7749 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7750 return LowerStore128(Op, DAG);
7751 }
7752 return SDValue();
7753 case ISD::STORE:
7754 return LowerSTORE(Op, DAG);
7755 case ISD::MSTORE:
7756 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7757 case ISD::MGATHER:
7758 return LowerMGATHER(Op, DAG);
7759 case ISD::MSCATTER:
7760 return LowerMSCATTER(Op, DAG);
7761 case ISD::VECREDUCE_SEQ_FADD:
7762 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7763 case ISD::VECREDUCE_ADD:
7764 case ISD::VECREDUCE_AND:
7765 case ISD::VECREDUCE_OR:
7766 case ISD::VECREDUCE_XOR:
7767 case ISD::VECREDUCE_SMAX:
7768 case ISD::VECREDUCE_SMIN:
7769 case ISD::VECREDUCE_UMAX:
7770 case ISD::VECREDUCE_UMIN:
7771 case ISD::VECREDUCE_FADD:
7772 case ISD::VECREDUCE_FMAX:
7773 case ISD::VECREDUCE_FMIN:
7774 case ISD::VECREDUCE_FMAXIMUM:
7775 case ISD::VECREDUCE_FMINIMUM:
7776 return LowerVECREDUCE(Op, DAG);
7777 case ISD::ATOMIC_LOAD_AND:
7778 return LowerATOMIC_LOAD_AND(Op, DAG);
7779 case ISD::DYNAMIC_STACKALLOC:
7780 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7781 case ISD::VSCALE:
7782 return LowerVSCALE(Op, DAG);
7783  case ISD::VECTOR_COMPRESS:
7784    return LowerVECTOR_COMPRESS(Op, DAG);
7785 case ISD::ANY_EXTEND:
7786 case ISD::SIGN_EXTEND:
7787 case ISD::ZERO_EXTEND:
7788 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7789 case ISD::ADDRSPACECAST:
7790 return LowerADDRSPACECAST(Op, DAG);
7791  case ISD::SIGN_EXTEND_INREG: {
7792    // Only custom lower when ExtraVT has a legal byte based element type.
7793 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7794 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7795 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7796 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7797 return SDValue();
7798
7799 return LowerToPredicatedOp(Op, DAG,
7800 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7801 }
7802 case ISD::TRUNCATE:
7803 return LowerTRUNCATE(Op, DAG);
7804 case ISD::MLOAD:
7805 return LowerMLOAD(Op, DAG);
7806 case ISD::LOAD:
7807 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7808 !Subtarget->isNeonAvailable()))
7809 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7810 return LowerLOAD(Op, DAG);
7811 case ISD::ADD:
7812 case ISD::AND:
7813 case ISD::SUB:
7814 return LowerToScalableOp(Op, DAG);
7815 case ISD::FMAXIMUM:
7816 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7817 case ISD::FMAXNUM:
7818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7819 case ISD::FMINIMUM:
7820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7821 case ISD::FMINNUM:
7822 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7823 case ISD::VSELECT:
7824 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7825 case ISD::ABS:
7826 return LowerABS(Op, DAG);
7827 case ISD::ABDS:
7828 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7829 case ISD::ABDU:
7830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7831 case ISD::AVGFLOORS:
7832 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7833 case ISD::AVGFLOORU:
7834 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7835 case ISD::AVGCEILS:
7836 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7837 case ISD::AVGCEILU:
7838 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7839 case ISD::BITREVERSE:
7840 return LowerBitreverse(Op, DAG);
7841 case ISD::BSWAP:
7842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7843 case ISD::CTLZ:
7844 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7845 case ISD::CTTZ:
7846 return LowerCTTZ(Op, DAG);
7847 case ISD::VECTOR_SPLICE:
7848 return LowerVECTOR_SPLICE(Op, DAG);
7849  case ISD::VECTOR_DEINTERLEAVE:
7850    return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7851  case ISD::VECTOR_INTERLEAVE:
7852    return LowerVECTOR_INTERLEAVE(Op, DAG);
7853 case ISD::GET_ACTIVE_LANE_MASK:
7854 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7855 case ISD::LRINT:
7856 case ISD::LLRINT:
7857 if (Op.getValueType().isVector())
7858 return LowerVectorXRINT(Op, DAG);
7859 [[fallthrough]];
7860 case ISD::LROUND:
7861 case ISD::LLROUND: {
7862 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7863 Op.getOperand(0).getValueType() == MVT::bf16) &&
7864 "Expected custom lowering of rounding operations only for f16");
7865 SDLoc DL(Op);
7866 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7867 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7868 }
7869 case ISD::STRICT_LROUND:
7870  case ISD::STRICT_LLROUND:
7871  case ISD::STRICT_LRINT:
7872 case ISD::STRICT_LLRINT: {
7873 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7874 Op.getOperand(1).getValueType() == MVT::bf16) &&
7875 "Expected custom lowering of rounding operations only for f16");
7876 SDLoc DL(Op);
7877 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7878 {Op.getOperand(0), Op.getOperand(1)});
7879 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7880 {Ext.getValue(1), Ext.getValue(0)});
7881 }
7882 case ISD::WRITE_REGISTER: {
7883 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7884 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7885 SDLoc DL(Op);
7886
7887 SDValue Chain = Op.getOperand(0);
7888 SDValue SysRegName = Op.getOperand(1);
7889 std::pair<SDValue, SDValue> Pair =
7890 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7891
7892 // chain = MSRR(chain, sysregname, lo, hi)
7893 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7894 SysRegName, Pair.first, Pair.second);
7895
7896 return Result;
7897 }
7898 case ISD::FSHL:
7899 case ISD::FSHR:
7900 return LowerFunnelShift(Op, DAG);
7901 case ISD::FLDEXP:
7902 return LowerFLDEXP(Op, DAG);
7903 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7904 return LowerVECTOR_HISTOGRAM(Op, DAG);
7905 case ISD::PARTIAL_REDUCE_SMLA:
7906 case ISD::PARTIAL_REDUCE_UMLA:
7907 case ISD::PARTIAL_REDUCE_SUMLA:
7908 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7909 }
7910}
7911
7912 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7913   return !Subtarget->useSVEForFixedLengthVectors();
7914}
7915
7916 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7917     EVT VT, bool OverrideNEON) const {
7918 if (!VT.isFixedLengthVector() || !VT.isSimple())
7919 return false;
7920
7921 // Don't use SVE for vectors we cannot scalarize if required.
7922 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7923 // Fixed length predicates should be promoted to i8.
7924 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7925 case MVT::i1:
7926 default:
7927 return false;
7928 case MVT::i8:
7929 case MVT::i16:
7930 case MVT::i32:
7931 case MVT::i64:
7932 case MVT::f16:
7933 case MVT::f32:
7934 case MVT::f64:
7935 break;
7936 }
7937
7938 // NEON-sized vectors can be emulated using SVE instructions.
7939 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7940 return Subtarget->isSVEorStreamingSVEAvailable();
7941
7942 // Ensure NEON MVTs only belong to a single register class.
7943 if (VT.getFixedSizeInBits() <= 128)
7944 return false;
7945
7946 // Ensure wider than NEON code generation is enabled.
7947 if (!Subtarget->useSVEForFixedLengthVectors())
7948 return false;
7949
7950 // Don't use SVE for types that don't fit.
7951 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7952 return false;
7953
7954 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7955 // the base fixed length SVE support in place.
7956 if (!VT.isPow2VectorType())
7957 return false;
7958
7959 return true;
7960}
7961
7962//===----------------------------------------------------------------------===//
7963// Calling Convention Implementation
7964//===----------------------------------------------------------------------===//
7965
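// Return the intrinsic ID of an INTRINSIC_WO_CHAIN node, or
// Intrinsic::not_intrinsic if the node is not a recognised intrinsic call.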
7966static unsigned getIntrinsicID(const SDNode *N) {
7967 unsigned Opcode = N->getOpcode();
7968 switch (Opcode) {
7969 default:
7970    return Intrinsic::not_intrinsic;
7971  case ISD::INTRINSIC_WO_CHAIN: {
7972    unsigned IID = N->getConstantOperandVal(0);
7973 if (IID < Intrinsic::num_intrinsics)
7974 return IID;
7975    return Intrinsic::not_intrinsic;
7976  }
7977 }
7978}
7979
7980 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7981                                                 SDValue N1) const {
7982 if (!N0.hasOneUse())
7983 return false;
7984
7985 unsigned IID = getIntrinsicID(N1.getNode());
7986 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7987 if (IID == Intrinsic::aarch64_neon_umull ||
7988 N1.getOpcode() == AArch64ISD::UMULL ||
7989 IID == Intrinsic::aarch64_neon_smull ||
7990 N1.getOpcode() == AArch64ISD::SMULL)
7991 return N0.getOpcode() != ISD::ADD;
7992
7993 return true;
7994}
7995
7996/// Selects the correct CCAssignFn for a given CallingConvention value.
7997 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7998                                                      bool IsVarArg) const {
7999 switch (CC) {
8000 default:
8001 reportFatalUsageError("unsupported calling convention");
8002 case CallingConv::GHC:
8003 return CC_AArch64_GHC;
8004  case CallingConv::PreserveNone:
8005    // The VarArg implementation makes assumptions about register
8006 // argument passing that do not hold for preserve_none, so we
8007 // instead fall back to C argument passing.
8008 // The non-vararg case is handled in the CC function itself.
8009 if (!IsVarArg)
8010      return CC_AArch64_Preserve_None;
8011    [[fallthrough]];
8012 case CallingConv::C:
8013 case CallingConv::Fast:
8014  case CallingConv::PreserveMost:
8015  case CallingConv::PreserveAll:
8016  case CallingConv::CXX_FAST_TLS:
8017  case CallingConv::Swift:
8018  case CallingConv::SwiftTail:
8019  case CallingConv::Tail:
8020 case CallingConv::GRAAL:
8021 if (Subtarget->isTargetWindows()) {
8022 if (IsVarArg) {
8023 if (Subtarget->isWindowsArm64EC())
8024          return CC_AArch64_Arm64EC_VarArg;
8025        return CC_AArch64_Win64_VarArg;
8026      }
8027 return CC_AArch64_Win64PCS;
8028 }
8029 if (!Subtarget->isTargetDarwin())
8030 return CC_AArch64_AAPCS;
8031 if (!IsVarArg)
8032 return CC_AArch64_DarwinPCS;
8033 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8035 case CallingConv::Win64:
8036 if (IsVarArg) {
8037 if (Subtarget->isWindowsArm64EC())
8038        return CC_AArch64_Arm64EC_VarArg;
8039      return CC_AArch64_Win64_VarArg;
8040    }
8041 return CC_AArch64_Win64PCS;
8043 if (Subtarget->isWindowsArm64EC())
8051 return CC_AArch64_AAPCS;
8056 }
8057}
8058
8059CCAssignFn *
8060 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8061   switch (CC) {
8062 default:
8063 return RetCC_AArch64_AAPCS;
8067 if (Subtarget->isWindowsArm64EC())
8069 return RetCC_AArch64_AAPCS;
8070 }
8071}
8072
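// Returns true for types that are passed in floating-point/SIMD registers:
// fixed-length vectors and non-scalable floating-point types.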
8073static bool isPassedInFPR(EVT VT) {
8074 return VT.isFixedLengthVector() ||
8075 (VT.isFloatingPoint() && !VT.isScalableVector());
8076}
8077
8078 static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8079                                 AArch64FunctionInfo &FuncInfo,
8080 SelectionDAG &DAG) {
8081 if (!FuncInfo.hasZT0SpillSlotIndex())
8082 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8083
8084 return DAG.getFrameIndex(
8085 FuncInfo.getZT0SpillSlotIndex(),
8087}
8088
8089// Emit a call to __arm_sme_save or __arm_sme_restore.
8090 static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8091                                        SelectionDAG &DAG,
8092                                        AArch64FunctionInfo *Info, SDLoc DL,
8093                                        SDValue Chain, bool IsSave) {
8096 FuncInfo->setSMESaveBufferUsed();
8097  TargetLowering::ArgListTy Args;
8098  Args.emplace_back(
8099 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8101
8102 RTLIB::Libcall LC =
8103 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8104 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8105 TLI.getPointerTy(DAG.getDataLayout()));
8106 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8107  TargetLowering::CallLoweringInfo CLI(DAG);
8108  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8109 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8110 return TLI.LowerCallTo(CLI).second;
8111}
8112
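// Emit the conditional lazy-save restore sequence for ZA: read TPIDR2_EL0,
// invoke the TPIDR2 restore routine via the RESTORE_ZA pseudo when required,
// and finally clear TPIDR2_EL0.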
8113 static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
8114                                      const AArch64TargetLowering &TLI,
8115 const AArch64RegisterInfo &TRI,
8116 AArch64FunctionInfo &FuncInfo,
8117 SelectionDAG &DAG) {
8118 // Conditionally restore the lazy save using a pseudo node.
8119 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8120 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8121 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8122      DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC)));
8123  SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8124 TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
8125 SDValue TPIDR2_EL0 = DAG.getNode(
8126 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8127 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8128 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8129 // RESTORE_ZA pseudo.
8130 SDValue Glue;
8131 SDValue TPIDR2Block = DAG.getFrameIndex(
8132 TPIDR2.FrameIndex,
8133      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8134  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8135 Chain =
8136 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8137 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8138 RestoreRoutine, RegMask, Chain.getValue(1)});
8139 // Finally reset the TPIDR2_EL0 register to 0.
8140 Chain = DAG.getNode(
8141 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8142 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8143 DAG.getConstant(0, DL, MVT::i64));
8144 TPIDR2.Uses++;
8145 return Chain;
8146}
8147
8148SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8149 SelectionDAG &DAG) const {
8150 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8151 SDValue Glue = Chain.getValue(1);
8152
8153 MachineFunction &MF = DAG.getMachineFunction();
8154 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8155 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8156 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8157
8158 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8159
8160 // The following conditions are true on entry to an exception handler:
8161 // - PSTATE.SM is 0.
8162 // - PSTATE.ZA is 0.
8163 // - TPIDR2_EL0 is null.
8164 // See:
8165 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8166 //
8167 // Therefore, if the function that contains this exception handler is a
8168 // streaming[-compatible] function, we must re-enable streaming mode.
8169 //
8170 // These mode changes are usually optimized away in catch blocks as they
8171 // occur before the __cxa_begin_catch (which is a non-streaming function),
8172 // but are necessary in some cases (such as for cleanups).
8173 //
8174 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8175
8176 // [COND_]SMSTART SM
8177 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8178 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8179 /*Glue*/ Glue, AArch64SME::Always);
8180 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8181 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8183
8184 if (getTM().useNewSMEABILowering())
8185 return Chain;
8186
8187 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8188 // Restore full ZA
8189 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8190 /*IsSave=*/false);
8191 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8192 // SMSTART ZA
8193 Chain = DAG.getNode(
8194 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8195 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8196
8197 // Restore ZT0
8198 if (SMEFnAttrs.hasZT0State()) {
8199 SDValue ZT0FrameIndex =
8200 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8201 Chain =
8202 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8203 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8204 }
8205
8206 // Restore ZA
8207 if (SMEFnAttrs.hasZAState())
8208 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8209 }
8210
8211 return Chain;
8212}
8213
8214SDValue AArch64TargetLowering::LowerFormalArguments(
8215 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8216 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8217 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8218 MachineFunction &MF = DAG.getMachineFunction();
8219 const Function &F = MF.getFunction();
8220 MachineFrameInfo &MFI = MF.getFrameInfo();
8221 bool IsWin64 =
8222 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8223 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8224 (isVarArg && Subtarget->isWindowsArm64EC());
8225 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8226 
8227   SmallVector<ISD::OutputArg, 4> Outs;
8228   GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8230 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8231 FuncInfo->setIsSVECC(true);
8232
8233 // Assign locations to all of the incoming arguments.
8234   SmallVector<CCValAssign, 16> ArgLocs;
8235   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8236
8237 // At this point, Ins[].VT may already be promoted to i32. To correctly
8238 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8239 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8240 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8241 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8242 // LocVT.
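// For example (illustrative), an i8 argument is analyzed with ValVT = i8
// below, so a stack-passed i8 is later fetched with an 8-bit extending load
// rather than a full i32 load of its slot (see the ExtType handling further
// down).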
8243 unsigned NumArgs = Ins.size();
8244 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8245 unsigned CurArgIdx = 0;
8246 bool UseVarArgCC = false;
8247 if (IsWin64)
8248 UseVarArgCC = isVarArg;
8249
8250 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8251
8252 for (unsigned i = 0; i != NumArgs; ++i) {
8253 MVT ValVT = Ins[i].VT;
8254 if (Ins[i].isOrigArg()) {
8255 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8256 CurArgIdx = Ins[i].getOrigArgIndex();
8257
8258 // Get type of the original argument.
8259 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8260 /*AllowUnknown*/ true);
8261 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8262 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8263 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8264 ValVT = MVT::i8;
8265 else if (ActualMVT == MVT::i16)
8266 ValVT = MVT::i16;
8267 }
8268 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8269 Ins[i].OrigTy, CCInfo);
8270 assert(!Res && "Call operand has unhandled type");
8271 (void)Res;
8272 }
8273
8274 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8275 bool IsLocallyStreaming =
8276 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8277 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8278 SDValue Glue = Chain.getValue(1);
8279
8280 unsigned ExtraArgLocs = 0;
8281 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8282 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8283
8284 if (Ins[i].Flags.isByVal()) {
8285 // Byval is used for HFAs in the PCS, but the system should work in a
8286 // non-compliant manner for larger structs.
8287 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8288 int Size = Ins[i].Flags.getByValSize();
8289 unsigned NumRegs = (Size + 7) / 8;
8290
8291 // FIXME: This works on big-endian for composite byvals, which are the common
8292 // case. It should also work for fundamental types too.
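// Note (illustrative): the byval aggregate itself stays in the stack area the
// caller populated; only the address of that fixed object is handed to the
// function body via InVals.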
8293 unsigned FrameIdx =
8294 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8295 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8296 InVals.push_back(FrameIdxN);
8297
8298 continue;
8299 }
8300
8301 if (Ins[i].Flags.isSwiftAsync())
8302 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8303
8304 SDValue ArgValue;
8305 if (VA.isRegLoc()) {
8306 // Arguments stored in registers.
8307 EVT RegVT = VA.getLocVT();
8308 const TargetRegisterClass *RC;
8309
8310 if (RegVT == MVT::i32)
8311 RC = &AArch64::GPR32RegClass;
8312 else if (RegVT == MVT::i64)
8313 RC = &AArch64::GPR64RegClass;
8314 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8315 RC = &AArch64::FPR16RegClass;
8316 else if (RegVT == MVT::f32)
8317 RC = &AArch64::FPR32RegClass;
8318 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8319 RC = &AArch64::FPR64RegClass;
8320 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8321 RC = &AArch64::FPR128RegClass;
8322 else if (RegVT.isScalableVector() &&
8323 RegVT.getVectorElementType() == MVT::i1) {
8324 FuncInfo->setIsSVECC(true);
8325 RC = &AArch64::PPRRegClass;
8326 } else if (RegVT == MVT::aarch64svcount) {
8327 FuncInfo->setIsSVECC(true);
8328 RC = &AArch64::PPRRegClass;
8329 } else if (RegVT.isScalableVector()) {
8330 FuncInfo->setIsSVECC(true);
8331 RC = &AArch64::ZPRRegClass;
8332 } else
8333 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8334
8335 // Transform the arguments in physical registers into virtual ones.
8336 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8337
8338 if (IsLocallyStreaming) {
8339 // LocallyStreamingFunctions must insert the SMSTART in the correct
8340 // position, so we use Glue to ensure no instructions can be scheduled
8341 // between the chain of:
8342 // t0: ch,glue = EntryNode
8343 // t1: res,ch,glue = CopyFromReg
8344 // ...
8345 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8346 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8347 // ^^^^^^
8348 // This will be the new Chain/Root node.
8349 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8350 Glue = ArgValue.getValue(2);
8351 if (isPassedInFPR(ArgValue.getValueType())) {
8352 ArgValue =
8353 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8354 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8355 {ArgValue, Glue});
8356 Glue = ArgValue.getValue(1);
8357 }
8358 } else
8359 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8360
8361 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8362 // to 64 bits. Insert an assert[sz]ext to capture this, then
8363 // truncate to the right size.
8364 switch (VA.getLocInfo()) {
8365 default:
8366 llvm_unreachable("Unknown loc info!");
8367 case CCValAssign::Full:
8368 break;
8369 case CCValAssign::Indirect:
8370 assert(
8371 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8372 "Indirect arguments should be scalable on most subtargets");
8373 break;
8374 case CCValAssign::BCvt:
8375 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8376 break;
8377 case CCValAssign::AExt:
8378 case CCValAssign::SExt:
8379 case CCValAssign::ZExt:
8380 break;
8381 case CCValAssign::AExtUpper:
8382 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8383 DAG.getConstant(32, DL, RegVT));
8384 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8385 break;
8386 }
8387 } else { // VA.isRegLoc()
8388 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8389 unsigned ArgOffset = VA.getLocMemOffset();
8390 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8391 ? VA.getLocVT().getSizeInBits()
8392 : VA.getValVT().getSizeInBits()) / 8;
8393
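// On big-endian targets a sub-8-byte stack argument lives in the high bytes
// of its 8-byte slot, so the address is nudged by (8 - ArgSize) below (e.g.
// an i16 nominally at slot offset 0 is actually loaded from offset 6).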
8394 uint32_t BEAlign = 0;
8395 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8396 !Ins[i].Flags.isInConsecutiveRegs())
8397 BEAlign = 8 - ArgSize;
8398
8399 SDValue FIN;
8400 MachinePointerInfo PtrInfo;
8401 if (StackViaX4) {
8402 // In both the ARM64EC varargs convention and the thunk convention,
8403 // arguments on the stack are accessed relative to x4, not sp. In
8404 // the thunk convention, there's an additional offset of 32 bytes
8405 // to account for the shadow store.
8406 unsigned ObjOffset = ArgOffset + BEAlign;
8407 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8408 ObjOffset += 32;
8409 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8410 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8411 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8412 DAG.getConstant(ObjOffset, DL, MVT::i64));
8413 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8414 } else {
8415 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8416
8417 // Create load nodes to retrieve arguments from the stack.
8418 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8419 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8420 }
8421
8422 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
8423 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8424 MVT MemVT = VA.getValVT();
8425
8426 switch (VA.getLocInfo()) {
8427 default:
8428 break;
8429 case CCValAssign::Trunc:
8430 case CCValAssign::BCvt:
8431 MemVT = VA.getLocVT();
8432 break;
8433 case CCValAssign::Indirect:
8434 assert((VA.getValVT().isScalableVT() ||
8435 Subtarget->isWindowsArm64EC()) &&
8436 "Indirect arguments should be scalable on most subtargets");
8437 MemVT = VA.getLocVT();
8438 break;
8439 case CCValAssign::SExt:
8440 ExtType = ISD::SEXTLOAD;
8441 break;
8442 case CCValAssign::ZExt:
8443 ExtType = ISD::ZEXTLOAD;
8444 break;
8445 case CCValAssign::AExt:
8446 ExtType = ISD::EXTLOAD;
8447 break;
8448 }
8449
8450 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8451 MemVT);
8452 }
8453
8454 if (VA.getLocInfo() == CCValAssign::Indirect) {
8455 assert((VA.getValVT().isScalableVT() ||
8456 Subtarget->isWindowsArm64EC()) &&
8457 "Indirect arguments should be scalable on most subtargets");
8458
8459 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8460 unsigned NumParts = 1;
8461 if (Ins[i].Flags.isInConsecutiveRegs()) {
8462 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8463 ++NumParts;
8464 }
8465
8466 MVT PartLoad = VA.getValVT();
8467 SDValue Ptr = ArgValue;
8468
8469 // Ensure we generate all loads for each tuple part, whilst updating the
8470 // pointer after each load correctly using vscale.
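// (Illustrative: a two-part SVE tuple becomes two loads, with the pointer
// advanced by vscale * PartSize between them; PartSize is the unscaled store
// size of a single part.)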
8471 while (NumParts > 0) {
8472 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8473 InVals.push_back(ArgValue);
8474 NumParts--;
8475 if (NumParts > 0) {
8476 SDValue BytesIncrement;
8477 if (PartLoad.isScalableVector()) {
8478 BytesIncrement = DAG.getVScale(
8479 DL, Ptr.getValueType(),
8480 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8481 } else {
8482 BytesIncrement = DAG.getConstant(
8483 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8484 Ptr.getValueType());
8485 }
8486 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8487 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8488 ExtraArgLocs++;
8489 i++;
8490 }
8491 }
8492 } else {
8493 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8494 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8495 ArgValue, DAG.getValueType(MVT::i32));
8496
8497 // i1 arguments are zero-extended to i8 by the caller. Emit a
8498 // hint to reflect this.
8499 if (Ins[i].isOrigArg()) {
8500 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8501 if (OrigArg->getType()->isIntegerTy(1)) {
8502 if (!Ins[i].Flags.isZExt()) {
8503 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8504 ArgValue.getValueType(), ArgValue);
8505 }
8506 }
8507 }
8508
8509 InVals.push_back(ArgValue);
8510 }
8511 }
8512 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8513
8514 if (Attrs.hasStreamingCompatibleInterface()) {
8515 SDValue EntryPStateSM =
8516 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8517 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8518
8519 // Copy the value to a virtual register, and save that in FuncInfo.
8520 Register EntryPStateSMReg =
8521 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8522 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8523 EntryPStateSM);
8524 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8525 }
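// The PStateSM virtual register captured above is read again by
// changeStreamingMode() whenever a conditional SMSTART/SMSTOP is emitted for
// this function (the Condition != AArch64SME::Always path there).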
8526
8527 // Insert the SMSTART if this is a locally streaming function and
8528 // make sure it is Glued to the last CopyFromReg value.
8529 if (IsLocallyStreaming) {
8530 if (Attrs.hasStreamingCompatibleInterface())
8531 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8532 AArch64SME::IfCallerIsNonStreaming);
8533 else
8534 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8535 AArch64SME::Always);
8536
8537 // Ensure that the SMSTART happens after the CopyWithChain such that its
8538 // chain result is used.
8539 for (unsigned I=0; I<InVals.size(); ++I) {
8540 Register Reg = MF.getRegInfo().createVirtualRegister(
8541 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8542 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8543 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8544 InVals[I].getValueType());
8545 }
8546 }
8547
8548 // varargs
8549 if (isVarArg) {
8551 if (!Subtarget->isTargetDarwin() || IsWin64) {
8552 // The AAPCS variadic function ABI is identical to the non-variadic
8553 // one. As a result there may be more arguments in registers and we
8554 // should save them for future reference.
8555 // Win64 variadic functions also pass arguments in registers, but all
8556 // float arguments are passed in integer registers.
8557 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8558 }
8559
8560 // This will point to the next argument passed via stack.
8561 unsigned VarArgsOffset = CCInfo.getStackSize();
8562 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8563 VarArgsOffset =
8564 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8565 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8566 FuncInfo->setVarArgsStackIndex(
8567 MFI.CreateFixedObject(4, VarArgsOffset, true));
8568 }
8569
8570 if (MFI.hasMustTailInVarArgFunc()) {
8571 SmallVector<MVT, 2> RegParmTypes;
8572 RegParmTypes.push_back(MVT::i64);
8573 RegParmTypes.push_back(MVT::f128);
8574 // Compute the set of forwarded registers. The rest are scratch.
8575 SmallVectorImpl<ForwardedRegister> &Forwards =
8576 FuncInfo->getForwardedMustTailRegParms();
8577 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8578 CC_AArch64_AAPCS);
8579
8580 // Conservatively forward X8, since it might be used for aggregate return.
8581 if (!CCInfo.isAllocated(AArch64::X8)) {
8582 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8583 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8584 }
8585 }
8586 }
8587
8588 // On Windows, InReg pointers must be returned, so record the pointer in a
8589 // virtual register at the start of the function so it can be returned in the
8590 // epilogue.
8591 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8592 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8593 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8594 Ins[I].Flags.isInReg()) &&
8595 Ins[I].Flags.isSRet()) {
8596 assert(!FuncInfo->getSRetReturnReg());
8597
8598 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8599 Register Reg =
8600 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8601 FuncInfo->setSRetReturnReg(Reg);
8602
8603 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8604 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8605 break;
8606 }
8607 }
8608 }
8609
8610 unsigned StackArgSize = CCInfo.getStackSize();
8611 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8612 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8613 // This is a non-standard ABI so by fiat I say we're allowed to make full
8614 // use of the stack area to be popped, which must be aligned to 16 bytes in
8615 // any case:
8616 StackArgSize = alignTo(StackArgSize, 16);
8617
8618 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8619 // a multiple of 16.
8620 FuncInfo->setArgumentStackToRestore(StackArgSize);
8621
8622 // This realignment carries over to the available bytes below. Our own
8623 // callers will guarantee the space is free by giving an aligned value to
8624 // CALLSEQ_START.
8625 }
8626 // Even if we're not expected to free up the space, it's useful to know how
8627 // much is there while considering tail calls (because we can reuse it).
8628 FuncInfo->setBytesInStackArgArea(StackArgSize);
8629
8630 if (Subtarget->hasCustomCallingConv())
8631 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8632
8633 if (getTM().useNewSMEABILowering()) {
8634 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8635 SDValue Size;
8636 if (Attrs.hasZAState()) {
8637 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8638 DAG.getConstant(1, DL, MVT::i32));
8639 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8640 } else if (Attrs.hasAgnosticZAInterface()) {
8641 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8642 SDValue Callee = DAG.getExternalSymbol(
8643 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8644 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8645 TargetLowering::CallLoweringInfo CLI(DAG);
8646 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8647 getLibcallCallingConv(LC), RetTy, Callee, {});
8648 std::tie(Size, Chain) = LowerCallTo(CLI);
8649 }
8650 if (Size) {
8651 SDValue Buffer = DAG.getNode(
8652 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8653 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8654 Chain = Buffer.getValue(1);
8655
8656 Register BufferPtr =
8657 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8658 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8659 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8660 DAG.getVTList(MVT::Other), Chain);
8661 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8662 MFI.CreateVariableSizedObject(Align(16), nullptr);
8663 }
8664 }
8665 } else {
8666 // Old SME ABI lowering (deprecated):
8667 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8668 // will be expanded and stored in the static object later using a
8669 // pseudonode.
8670 if (Attrs.hasZAState()) {
8671 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8672 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8673 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8674 DAG.getConstant(1, DL, MVT::i32));
8675 SDValue Buffer;
8676 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8677 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8678 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8679 } else {
8680 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8681 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8682 DAG.getVTList(MVT::i64, MVT::Other),
8683 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8684 MFI.CreateVariableSizedObject(Align(16), nullptr);
8685 }
8686 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8687 DAG.getConstant(1, DL, MVT::i32));
8688 Chain = DAG.getNode(
8689 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8690 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8691 /*Num save slices*/ NumZaSaveSlices});
8692 } else if (Attrs.hasAgnosticZAInterface()) {
8693 // Call __arm_sme_state_size().
8694 SDValue BufferSize =
8695 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8696 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8697 Chain = BufferSize.getValue(1);
8698 SDValue Buffer;
8699 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8700 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8701 DAG.getVTList(MVT::i64, MVT::Other),
8702 {Chain, BufferSize});
8703 } else {
8704 // Allocate space dynamically.
8705 Buffer = DAG.getNode(
8706 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8707 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8708 MFI.CreateVariableSizedObject(Align(16), nullptr);
8709 }
8710 // Copy the value to a virtual register, and save that in FuncInfo.
8711 Register BufferPtr =
8712 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8713 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8714 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8715 }
8716 }
8717
8718 if (CallConv == CallingConv::PreserveNone) {
8719 for (const ISD::InputArg &I : Ins) {
8720 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8721 I.Flags.isSwiftAsync()) {
8722 MachineFunction &MF = DAG.getMachineFunction();
8723 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8724 MF.getFunction(),
8725 "Swift attributes can't be used with preserve_none",
8726 DL.getDebugLoc()));
8727 break;
8728 }
8729 }
8730 }
8731
8732 if (getTM().useNewSMEABILowering()) {
8733 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8734 if (Attrs.isNewZT0())
8735 Chain = DAG.getNode(
8736 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8737 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8738 DAG.getTargetConstant(0, DL, MVT::i32));
8739 }
8740
8741 return Chain;
8742}
8743
8744void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8745 SelectionDAG &DAG,
8746 const SDLoc &DL,
8747 SDValue &Chain) const {
8748 MachineFunction &MF = DAG.getMachineFunction();
8749 MachineFrameInfo &MFI = MF.getFrameInfo();
8750 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8751 auto PtrVT = getPointerTy(DAG.getDataLayout());
8752 Function &F = MF.getFunction();
8753 bool IsWin64 =
8754 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8755
8756 SmallVector<SDValue, 8> MemOps;
8757
8758 auto GPRArgRegs = AArch64::getGPRArgRegs();
8759 unsigned NumGPRArgRegs = GPRArgRegs.size();
8760 if (Subtarget->isWindowsArm64EC()) {
8761 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8762 // functions.
8763 NumGPRArgRegs = 4;
8764 }
8765 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8766
8767 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
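// e.g. a variadic AAPCS function with two named GPR arguments must save
// x2..x7: GPRSaveSize = 8 * (8 - 2) = 48 bytes (on Arm64EC only x0..x3 are
// considered, so the same case saves 16 bytes).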
8768 int GPRIdx = 0;
8769 if (GPRSaveSize != 0) {
8770 if (IsWin64) {
8771 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8772 if (GPRSaveSize & 15)
8773 // The extra size here, if triggered, will always be 8.
8774 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8775 } else
8776 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8777
8778 SDValue FIN;
8779 if (Subtarget->isWindowsArm64EC()) {
8780 // With the Arm64EC ABI, we reserve the save area as usual, but we
8781 // compute its address relative to x4. For a normal AArch64->AArch64
8782 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8783 // different address.
8784 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8785 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8786 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8787 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8788 } else {
8789 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8790 }
8791
8792 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8793 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8794 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8795 SDValue Store =
8796 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8797 IsWin64 ? MachinePointerInfo::getFixedStack(
8798 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8799 : MachinePointerInfo::getStack(MF, i * 8));
8800 MemOps.push_back(Store);
8801 FIN =
8802 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8803 }
8804 }
8805 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8806 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8807
8808 if (Subtarget->hasFPARMv8() && !IsWin64) {
8809 auto FPRArgRegs = AArch64::getFPRArgRegs();
8810 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8811 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8812
8813 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8814 int FPRIdx = 0;
8815 if (FPRSaveSize != 0) {
8816 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8817
8818 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8819
8820 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8821 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8822 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8823
8824 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8825 MachinePointerInfo::getStack(MF, i * 16));
8826 MemOps.push_back(Store);
8827 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8828 DAG.getConstant(16, DL, PtrVT));
8829 }
8830 }
8831 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8832 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8833 }
8834
8835 if (!MemOps.empty()) {
8836 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8837 }
8838}
8839
8840/// LowerCallResult - Lower the result values of a call into the
8841/// appropriate copies out of appropriate physical registers.
8842SDValue AArch64TargetLowering::LowerCallResult(
8843 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8844 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8845 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8846 SDValue ThisVal, bool RequiresSMChange) const {
8847 DenseMap<unsigned, SDValue> CopiedRegs;
8848 // Copy all of the result registers out of their specified physreg.
8849 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8850 CCValAssign VA = RVLocs[i];
8851
8852 // Pass 'this' value directly from the argument to return value, to avoid
8853 // reg unit interference
8854 if (i == 0 && isThisReturn) {
8855 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8856 "unexpected return calling convention register assignment");
8857 InVals.push_back(ThisVal);
8858 continue;
8859 }
8860
8861 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8862 // allows one use of a physreg per block.
8863 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8864 if (!Val) {
8865 Val =
8866 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8867 Chain = Val.getValue(1);
8868 InGlue = Val.getValue(2);
8869 CopiedRegs[VA.getLocReg()] = Val;
8870 }
8871
8872 switch (VA.getLocInfo()) {
8873 default:
8874 llvm_unreachable("Unknown loc info!");
8875 case CCValAssign::Full:
8876 break;
8877 case CCValAssign::BCvt:
8878 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8879 break;
8880 case CCValAssign::AExtUpper:
8881 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8882 DAG.getConstant(32, DL, VA.getLocVT()));
8883 [[fallthrough]];
8884 case CCValAssign::AExt:
8885 [[fallthrough]];
8886 case CCValAssign::ZExt:
8887 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8888 break;
8889 }
8890
8891 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8892 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8893 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8894
8895 InVals.push_back(Val);
8896 }
8897
8898 return Chain;
8899}
8900
8901/// Return true if the calling convention is one that we can guarantee TCO for.
8902static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8903 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8904 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8905}
8906
8907/// Return true if we might ever do TCO for calls with this calling convention.
8908static bool mayTailCallThisCC(CallingConv::ID CC) {
8909 switch (CC) {
8910 case CallingConv::C:
8911 case CallingConv::AArch64_SVE_VectorCall:
8912 case CallingConv::PreserveMost:
8913 case CallingConv::PreserveAll:
8914 case CallingConv::PreserveNone:
8915 case CallingConv::Swift:
8916 case CallingConv::SwiftTail:
8917 case CallingConv::Tail:
8918 case CallingConv::Fast:
8919 return true;
8920 default:
8921 return false;
8922 }
8923}
8924
8925/// Return true if the call convention supports varargs
8926/// Currently only those that pass varargs like the C
8927/// calling convention does are eligible
8928/// Calling conventions listed in this function must also
8929/// be properly handled in AArch64Subtarget::isCallingConvWin64
8930static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8931 switch (CC) {
8932 case CallingConv::C:
8933 case CallingConv::Win64:
8934 // SVE vector call is only partially supported, but it should
8935 // support named arguments being passed. Any arguments being passed
8936 // as varargs are still unsupported.
8937 case CallingConv::AArch64_SVE_VectorCall:
8938 return true;
8939 default:
8940 return false;
8941 }
8942}
8943
8944static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8945 const AArch64Subtarget *Subtarget,
8946 const TargetLowering::CallLoweringInfo &CLI,
8947 CCState &CCInfo) {
8948 const SelectionDAG &DAG = CLI.DAG;
8949 CallingConv::ID CalleeCC = CLI.CallConv;
8950 bool IsVarArg = CLI.IsVarArg;
8951 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8952 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8953
8954 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8955 // for the shadow store.
8956 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8957 CCInfo.AllocateStack(32, Align(16));
8958
8959 unsigned NumArgs = Outs.size();
8960 for (unsigned i = 0; i != NumArgs; ++i) {
8961 MVT ArgVT = Outs[i].VT;
8962 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8963
8964 bool UseVarArgCC = false;
8965 if (IsVarArg) {
8966 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8967 // too, so use the vararg CC to force them to integer registers.
8968 if (IsCalleeWin64) {
8969 UseVarArgCC = true;
8970 } else {
8971 UseVarArgCC = ArgFlags.isVarArg();
8972 }
8973 }
8974
8975 if (!UseVarArgCC) {
8976 // Get type of the original argument.
8977 EVT ActualVT =
8978 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8979 /*AllowUnknown*/ true);
8980 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8981 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8982 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8983 ArgVT = MVT::i8;
8984 else if (ActualMVT == MVT::i16)
8985 ArgVT = MVT::i16;
8986 }
8987
8988 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8989 // argument. This logic should exactly mirror LowerFormalArguments.
8990 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8991 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8992 Outs[i].OrigTy, CCInfo);
8993 assert(!Res && "Call operand has unhandled type");
8994 (void)Res;
8995 }
8996}
8997
8998 static SMECallAttrs
8999 getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
9000 const TargetLowering::CallLoweringInfo &CLI) {
9001 if (CLI.CB)
9002 return SMECallAttrs(*CLI.CB, &TLI);
9003 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9004 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
9005 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
9006}
9007
9008bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9009 const CallLoweringInfo &CLI) const {
9010 CallingConv::ID CalleeCC = CLI.CallConv;
9011 if (!mayTailCallThisCC(CalleeCC))
9012 return false;
9013
9014 SDValue Callee = CLI.Callee;
9015 bool IsVarArg = CLI.IsVarArg;
9016 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9017 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9018 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9019 const SelectionDAG &DAG = CLI.DAG;
9020 MachineFunction &MF = DAG.getMachineFunction();
9021 const Function &CallerF = MF.getFunction();
9022 CallingConv::ID CallerCC = CallerF.getCallingConv();
9023
9024 // SME Streaming functions are not eligible for TCO as they may require
9025 // the streaming mode or ZA to be restored after returning from the call.
9026 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
9027 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9028 CallAttrs.requiresPreservingAllZAState() ||
9029 CallAttrs.caller().hasStreamingBody())
9030 return false;
9031
9032 // Functions using the C or Fast calling convention that have an SVE signature
9033 // preserve more registers and should assume the SVE_VectorCall CC.
9034 // The check for matching callee-saved regs will determine whether it is
9035 // eligible for TCO.
9036 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9037 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9038 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9039
9040 bool CCMatch = CallerCC == CalleeCC;
9041
9042 // When using the Windows calling convention on a non-windows OS, we want
9043 // to back up and restore X18 in such functions; we can't do a tail call
9044 // from those functions.
9045 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9046 CalleeCC != CallingConv::Win64)
9047 return false;
9048
9049 // Byval parameters hand the function a pointer directly into the stack area
9050 // we want to reuse during a tail call. Working around this *is* possible (see
9051 // X86) but less efficient and uglier in LowerCall.
9052 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9053 e = CallerF.arg_end();
9054 i != e; ++i) {
9055 if (i->hasByValAttr())
9056 return false;
9057
9058 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9059 // In this case, it is necessary to save X0/X1 in the callee and return it
9060 // in X0. Tail call opt may interfere with this, so we disable tail call
9061 // opt when the caller has an "inreg" attribute -- except if the callee
9062 // also has that attribute on the same argument, and the same value is
9063 // passed.
9064 if (i->hasInRegAttr()) {
9065 unsigned ArgIdx = i - CallerF.arg_begin();
9066 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9067 return false;
9068 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9069 if (!Attrs.hasAttribute(Attribute::InReg) ||
9070 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9071 CLI.CB->getArgOperand(ArgIdx) != i) {
9072 return false;
9073 }
9074 }
9075 }
9076
9077 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9078 return CCMatch;
9079
9080 // Externally-defined functions with weak linkage should not be
9081 // tail-called on AArch64 when the OS does not support dynamic
9082 // pre-emption of symbols, as the AAELF spec requires normal calls
9083 // to undefined weak functions to be replaced with a NOP or jump to the
9084 // next instruction. The behaviour of branch instructions in this
9085 // situation (as used for tail calls) is implementation-defined, so we
9086 // cannot rely on the linker replacing the tail call with a return.
9087 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9088 const GlobalValue *GV = G->getGlobal();
9089 const Triple &TT = getTargetMachine().getTargetTriple();
9090 if (GV->hasExternalWeakLinkage() &&
9091 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9092 return false;
9093 }
9094
9095 // Now we search for cases where we can use a tail call without changing the
9096 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9097 // concept.
9098
9099 // I want anyone implementing a new calling convention to think long and hard
9100 // about this assert.
9101 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9102 report_fatal_error("Unsupported variadic calling convention");
9103
9104 LLVMContext &C = *DAG.getContext();
9105 // Check that the call results are passed in the same way.
9106 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9107 CCAssignFnForCall(CalleeCC, IsVarArg),
9108 CCAssignFnForCall(CallerCC, IsVarArg)))
9109 return false;
9110 // The callee has to preserve all registers the caller needs to preserve.
9111 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9112 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9113 if (!CCMatch) {
9114 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9115 if (Subtarget->hasCustomCallingConv()) {
9116 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9117 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9118 }
9119 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9120 return false;
9121 }
9122
9123 // Nothing more to check if the callee is taking no arguments
9124 if (Outs.empty())
9125 return true;
9126
9128 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9129
9130 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9131
9132 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9133 // When we are musttail, additional checks have been done and we can safely ignore this check
9134 // At least two cases here: if caller is fastcc then we can't have any
9135 // memory arguments (we'd be expected to clean up the stack afterwards). If
9136 // caller is C then we could potentially use its argument area.
9137
9138 // FIXME: for now we take the most conservative of these in both cases:
9139 // disallow all variadic memory operands.
9140 for (const CCValAssign &ArgLoc : ArgLocs)
9141 if (!ArgLoc.isRegLoc())
9142 return false;
9143 }
9144
9145 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9146
9147 // If any of the arguments is passed indirectly, it must be SVE, so the
9148 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9149 // allocate space on the stack. That is why we explicitly determine here that
9150 // such a call cannot be a tail call.
9151 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9152 assert((A.getLocInfo() != CCValAssign::Indirect ||
9153 A.getValVT().isScalableVector() ||
9154 Subtarget->isWindowsArm64EC()) &&
9155 "Expected value to be scalable");
9156 return A.getLocInfo() == CCValAssign::Indirect;
9157 }))
9158 return false;
9159
9160 // If the stack arguments for this call do not fit into our own save area then
9161 // the call cannot be made tail.
9162 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9163 return false;
9164
9165 const MachineRegisterInfo &MRI = MF.getRegInfo();
9166 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9167 return false;
9168
9169 return true;
9170}
9171
9172SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9173 SelectionDAG &DAG,
9174 MachineFrameInfo &MFI,
9175 int ClobberedFI) const {
9176 SmallVector<SDValue, 8> ArgChains;
9177 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9178 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9179
9180 // Include the original chain at the beginning of the list. When this is
9181 // used by target LowerCall hooks, this helps legalize find the
9182 // CALLSEQ_BEGIN node.
9183 ArgChains.push_back(Chain);
9184
9185 // Add a chain value for each stack argument load that overlaps the clobbered slot.
9186 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9187 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9188 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9189 if (FI->getIndex() < 0) {
9190 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9191 int64_t InLastByte = InFirstByte;
9192 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9193
9194 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9195 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9196 ArgChains.push_back(SDValue(L, 1));
9197 }
9198
9199 // Build a tokenfactor for all the chains.
9200 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9201}
9202
9203bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9204 bool TailCallOpt) const {
9205 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9206 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9207}
9208
9209// Check if the value is zero-extended from i1 to i8
9210static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9211 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9212 if (SizeInBits < 8)
9213 return false;
9214
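// 0xFE masks bits [7:1]; if KnownBits proves they are all zero, the low byte
// already holds a valid zero-extended bool (0 or 1) and no re-extension is
// needed at the call site.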
9215 APInt RequiredZero(SizeInBits, 0xFE);
9216 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9217 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9218 return ZExtBool;
9219}
9220
9221void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9222 SDNode *Node) const {
9223 // Live-in physreg copies that are glued to SMSTART are applied as
9224 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9225 // register allocator to pass call args in callee saved regs, without extra
9226 // copies to avoid these fake clobbers of actually-preserved GPRs.
9227 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9228 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9229 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9230 if (MachineOperand &MO = MI.getOperand(I);
9231 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9232 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9233 AArch64::GPR64RegClass.contains(MO.getReg())))
9234 MI.removeOperand(I);
9235
9236 // The SVE vector length can change when entering/leaving streaming mode.
9237 // FPMR is set to 0 when entering/leaving streaming mode.
9238 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9239 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9240 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9241 /*IsImplicit=*/true));
9242 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9243 /*IsImplicit=*/true));
9244 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9245 /*IsImplicit=*/true));
9246 }
9247 }
9248
9249 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9250 // have nothing to do with VG, were it not that they are used to materialise a
9251 // frame-address. If they contain a frame-index to a scalable vector, this
9252 // will likely require an ADDVL instruction to materialise the address, thus
9253 // reading VG.
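// (e.g. materialising the address of a scalable stack object typically uses
// ADDVL/ADDPL-style arithmetic, which depends on the current value of VG.)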
9254 const MachineFunction &MF = *MI.getMF();
9255 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9256 (MI.getOpcode() == AArch64::ADDXri ||
9257 MI.getOpcode() == AArch64::SUBXri)) {
9258 const MachineOperand &MO = MI.getOperand(1);
9259 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9260 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9261 /*IsImplicit=*/true));
9262 }
9263}
9264
9265SDValue AArch64TargetLowering::changeStreamingMode(
9266 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9267 unsigned Condition, bool InsertVectorLengthCheck) const {
9268 MachineFunction &MF = DAG.getMachineFunction();
9269 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9270 FuncInfo->setHasStreamingModeChanges(true);
9271
9272 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9273 SmallVector<SDValue, 2> Ops = {Chain};
9274 if (InGlue)
9275 Ops.push_back(InGlue);
9276 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9277 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9278 };
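// Roughly, the node sequence built below for a conditional transition is:
//   CopyFromReg(entry pstate.sm) -> COND_SMSTART / COND_SMSTOP (+ regmask),
// with CHECK_MATCHING_VL optionally placed before an SMSTART or after an
// SMSTOP, mirroring the comments below about when VL can be read.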
9279
9280 if (InsertVectorLengthCheck && Enable) {
9281 // Non-streaming -> Streaming
9282 // Insert vector length check before smstart
9283 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9284 Chain = CheckVL.getValue(0);
9285 InGlue = CheckVL.getValue(1);
9286 }
9287
9288 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9289 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9290 SDValue MSROp =
9291 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9292 SmallVector<SDValue> Ops = {Chain, MSROp};
9293 unsigned Opcode;
9294 if (Condition != AArch64SME::Always) {
9295 Register PStateReg = FuncInfo->getPStateSMReg();
9296 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9297 SDValue PStateSM =
9298 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9299 // Use chain and glue from the CopyFromReg.
9300 Ops[0] = PStateSM.getValue(1);
9301 InGlue = PStateSM.getValue(2);
9302 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9303 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9304 Ops.push_back(ConditionOp);
9305 Ops.push_back(PStateSM);
9306 } else {
9307 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9308 }
9309 Ops.push_back(RegMask);
9310
9311 if (InGlue)
9312 Ops.push_back(InGlue);
9313
9314 SDValue SMChange =
9315 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9316
9317 if (!InsertVectorLengthCheck || Enable)
9318 return SMChange;
9319
9320 // Streaming -> Non-streaming
9321 // Insert vector length check after smstop since we cannot read VL
9322 // in streaming mode
9323 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9324}
9325
9326static AArch64SME::ToggleCondition
9327getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9328 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9329 CallAttrs.caller().hasStreamingBody())
9330 return AArch64SME::Always;
9331 if (CallAttrs.callee().hasNonStreamingInterface())
9332 return AArch64SME::IfCallerIsStreaming;
9333 if (CallAttrs.callee().hasStreamingInterface())
9334 return AArch64SME::IfCallerIsNonStreaming;
9335
9336 llvm_unreachable("Unsupported attributes");
9337}
9338
9339/// Check whether a stack argument requires lowering in a tail call.
9340static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9341 const CCValAssign &VA, SDValue Arg,
9342 ISD::ArgFlagsTy Flags, int CallOffset) {
9343 // FIXME: We should be able to handle this case, but it's not clear how to.
9344 if (Flags.isZExt() || Flags.isSExt())
9345 return true;
9346
9347 for (;;) {
9348 // Look through nodes that don't alter the bits of the incoming value.
9349 unsigned Op = Arg.getOpcode();
9350 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9351 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9352 Arg = Arg.getOperand(0);
9353 continue;
9354 }
9355 break;
9356 }
9357
9358 // If the argument is a load from the same immutable stack slot, we can reuse
9359 // it.
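// (Illustrative: an i64 re-loaded from an immutable 8-byte fixed slot at
// exactly CallOffset can be forwarded as-is; any mismatch in offset or size
// falls through to the conservative 'return true' cases below.)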
9360 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9361 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9362 const MachineFrameInfo &MFI = MF.getFrameInfo();
9363 int FI = FINode->getIndex();
9364 if (!MFI.isImmutableObjectIndex(FI))
9365 return true;
9366 if (CallOffset != MFI.getObjectOffset(FI))
9367 return true;
9368 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9369 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9370 return true;
9371 return false;
9372 }
9373 }
9374
9375 return true;
9376}
9377
9378/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9379/// and add input and output parameter nodes.
9380SDValue
9381AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9382 SmallVectorImpl<SDValue> &InVals) const {
9383 SelectionDAG &DAG = CLI.DAG;
9384 SDLoc &DL = CLI.DL;
9385 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9386 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9387 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9388 SDValue Chain = CLI.Chain;
9389 SDValue Callee = CLI.Callee;
9390 bool &IsTailCall = CLI.IsTailCall;
9391 CallingConv::ID &CallConv = CLI.CallConv;
9392 bool IsVarArg = CLI.IsVarArg;
9393 const CallBase *CB = CLI.CB;
9394
9395 MachineFunction &MF = DAG.getMachineFunction();
9396 MachineFunction::CallSiteInfo CSInfo;
9397 bool IsThisReturn = false;
9398
9399 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9400 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9401 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9402 bool IsSibCall = false;
9403 bool GuardWithBTI = false;
9404
9405 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9406 !Subtarget->noBTIAtReturnTwice()) {
9407 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9408 }
9409
9410 // Analyze operands of the call, assigning locations to each operand.
9411 SmallVector<CCValAssign, 16> ArgLocs;
9412 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9413
9414 if (IsVarArg) {
9415 unsigned NumArgs = Outs.size();
9416
9417 for (unsigned i = 0; i != NumArgs; ++i) {
9418 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9419 report_fatal_error("Passing SVE types to variadic functions is "
9420 "currently not supported");
9421 }
9422 }
9423
9424 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9425
9426 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9427 // Assign locations to each value returned by this call.
9428 SmallVector<CCValAssign, 16> RVLocs;
9429 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9430 *DAG.getContext());
9431 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9432
9433 // Set type id for call site info.
9434 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9435 CSInfo = MachineFunction::CallSiteInfo(*CB);
9436
9437 // Check callee args/returns for SVE registers and set calling convention
9438 // accordingly.
9439 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9440 auto HasSVERegLoc = [](CCValAssign &Loc) {
9441 if (!Loc.isRegLoc())
9442 return false;
9443 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9444 AArch64::PPRRegClass.contains(Loc.getLocReg());
9445 };
9446 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9447 CallConv = CallingConv::AArch64_SVE_VectorCall;
9448 }
9449
9450 // Determine whether we need any streaming mode changes.
9451 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9452
9453 std::optional<unsigned> ZAMarkerNode;
9454 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9455
9456 if (UseNewSMEABILowering) {
9457 if (CallAttrs.requiresLazySave() ||
9458 CallAttrs.requiresPreservingAllZAState())
9459 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9460 else if (CallAttrs.caller().hasZAState() ||
9461 CallAttrs.caller().hasZT0State())
9462 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9463 }
9464
9465 if (IsTailCall) {
9466 // Check if it's really possible to do a tail call.
9467 IsTailCall = isEligibleForTailCallOptimization(CLI);
9468
9469 // A sibling call is one where we're under the usual C ABI and not planning
9470 // to change that but can still do a tail call:
9471 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9472 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9473 IsSibCall = true;
9474
9475 if (IsTailCall)
9476 ++NumTailCalls;
9477 }
9478
9479 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9480 report_fatal_error("failed to perform tail call elimination on a call "
9481 "site marked musttail");
9482
9483 // Get a count of how many bytes are to be pushed on the stack.
9484 unsigned NumBytes = CCInfo.getStackSize();
9485
9486 if (IsSibCall) {
9487 // Since we're not changing the ABI to make this a tail call, the memory
9488 // operands are already available in the caller's incoming argument space.
9489 NumBytes = 0;
9490 }
9491
9492 // FPDiff is the byte offset of the call's argument area from the callee's.
9493 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9494 // by this amount for a tail call. In a sibling call it must be 0 because the
9495 // caller will deallocate the entire stack and the callee still expects its
9496 // arguments to begin at SP+0. Completely unused for non-tail calls.
9497 int FPDiff = 0;
9498
9499 if (IsTailCall && !IsSibCall) {
9500 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9501
9502 // Since callee will pop argument stack as a tail call, we must keep the
9503 // popped size 16-byte aligned.
9504 NumBytes = alignTo(NumBytes, 16);
9505
9506 // FPDiff will be negative if this tail call requires more space than we
9507 // would automatically have in our incoming argument space. Positive if we
9508 // can actually shrink the stack.
9509 FPDiff = NumReusableBytes - NumBytes;
9510
9511 // Update the required reserved area if this is the tail call requiring the
9512 // most argument stack space.
9513 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9514 FuncInfo->setTailCallReservedStack(-FPDiff);
9515
9516 // The stack pointer must be 16-byte aligned at all times it's used for a
9517 // memory operation, which in practice means at *all* times and in
9518 // particular across call boundaries. Therefore our own arguments started at
9519 // a 16-byte aligned SP and the delta applied for the tail call should
9520 // satisfy the same constraint.
9521 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9522 }
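// Worked example (illustrative): with 32 bytes of incoming argument space and
// a tail call needing 48 bytes, FPDiff = 32 - 48 = -16, so at least 16 bytes
// of reserved stack are recorded via setTailCallReservedStack.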
9523
9524 auto DescribeCallsite =
9525 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9526 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9527 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9528 R << ore::NV("Callee", ES->getSymbol());
9529 else if (CLI.CB && CLI.CB->getCalledFunction())
9530 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9531 else
9532 R << "unknown callee";
9533 R << "'";
9534 return R;
9535 };
9536
9537 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9538 bool RequiresSaveAllZA =
9539 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9540 if (RequiresLazySave) {
9541 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9542 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9543 TPIDR2.FrameIndex,
9544 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9545 Chain = DAG.getNode(
9546 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9547 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9548 TPIDR2ObjAddr);
9549 OptimizationRemarkEmitter ORE(&MF.getFunction());
9550 ORE.emit([&]() {
9551 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9552 CLI.CB)
9553 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9554 &MF.getFunction());
9555 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9556 });
9557 } else if (RequiresSaveAllZA) {
9558 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9559 "Cannot share state that may not exist");
9560 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9561 /*IsSave=*/true);
9562 }
9563
9564 bool RequiresSMChange = CallAttrs.requiresSMChange();
9565 if (RequiresSMChange) {
9566 OptimizationRemarkEmitter ORE(&MF.getFunction());
9567 ORE.emit([&]() {
9568 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9569 CLI.CB)
9570 : OptimizationRemarkAnalysis("sme", "SMETransition",
9571 &MF.getFunction());
9572 DescribeCallsite(R) << " requires a streaming mode transition";
9573 return R;
9574 });
9575 }
9576
9577 SDValue ZTFrameIdx;
9578 MachineFrameInfo &MFI = MF.getFrameInfo();
9579 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9580
9581 // If the caller has ZT0 state which will not be preserved by the callee,
9582 // spill ZT0 before the call.
9583 if (ShouldPreserveZT0) {
9584 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9585
9586 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9587 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9588 }
9589
9590 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9591 // PSTATE.ZA before the call if there is no lazy-save active.
9592 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9593 assert((!DisableZA || !RequiresLazySave) &&
9594 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9595
9596 if (DisableZA)
9597 Chain = DAG.getNode(
9598 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9599 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9600
9601 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9602 // These operations are automatically eliminated by the prolog/epilog pass
9603 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9604 if (!IsSibCall) {
9605 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9606 if (ZAMarkerNode) {
9607 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9608 // using a chain can result in incorrect scheduling. The markers refer to
9609 // the position just before the CALLSEQ_START (though they occur after it,
9610 // as CALLSEQ_START lacks in-glue).
9611 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9612 {Chain, Chain.getValue(1)});
9613 }
9614 }
9615
9616 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9617 getPointerTy(DAG.getDataLayout()));
9618
9619 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9620 SmallSet<unsigned, 8> RegsUsed;
9621 SmallVector<SDValue, 8> MemOpChains;
9622 auto PtrVT = getPointerTy(DAG.getDataLayout());
9623
9624 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9625 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9626 for (const auto &F : Forwards) {
9627 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9628 RegsToPass.emplace_back(F.PReg, Val);
9629 }
9630 }
9631
9632 // Walk the register/memloc assignments, inserting copies/loads.
9633 unsigned ExtraArgLocs = 0;
9634 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9635 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9636 SDValue Arg = OutVals[i];
9637 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9638
9639 // Promote the value if needed.
9640 switch (VA.getLocInfo()) {
9641 default:
9642 llvm_unreachable("Unknown loc info!");
9643 case CCValAssign::Full:
9644 break;
9645 case CCValAssign::SExt:
9646 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9647 break;
9648 case CCValAssign::ZExt:
9649 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9650 break;
9651 case CCValAssign::AExt:
9652 if (Outs[i].ArgVT == MVT::i1) {
9653 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9654 //
9655 // Check if we actually have to do this, because the value may
9656 // already be zero-extended.
9657 //
9658 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9659 // and rely on DAGCombiner to fold this, because the following
9660 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9661 //
9662 // (ext (zext x)) -> (zext x)
9663 //
9664 // This will give us (zext i32), which we cannot remove, so
9665 // try to check this beforehand.
9666 if (!checkZExtBool(Arg, DAG)) {
9667 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9668 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9669 }
9670 }
9671 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9672 break;
9673 case CCValAssign::AExtUpper:
9674 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9675 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9676 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9677 DAG.getConstant(32, DL, VA.getLocVT()));
9678 break;
9679 case CCValAssign::BCvt:
9680 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9681 break;
9682 case CCValAssign::Trunc:
9683 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9684 break;
9685 case CCValAssign::FPExt:
9686 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9687 break;
9688 case CCValAssign::Indirect:
9689 bool isScalable = VA.getValVT().isScalableVT();
9690 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9691 "Indirect arguments should be scalable on most subtargets");
9692
9693 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9694 uint64_t PartSize = StoreSize;
9695 unsigned NumParts = 1;
9696 if (Outs[i].Flags.isInConsecutiveRegs()) {
9697 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9698 ++NumParts;
9699 StoreSize *= NumParts;
9700 }
9701
9702 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9703 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9704 MachineFrameInfo &MFI = MF.getFrameInfo();
9705 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9706 if (isScalable) {
9707 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
9708 VA.getValVT().getVectorElementType() == MVT::i1;
9709 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
9710 : TargetStackID::ScalableVector);
9711 }
9712
9713 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9714 SDValue Ptr = DAG.getFrameIndex(
9715 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9716 SDValue SpillSlot = Ptr;
9717
9718 // Ensure we generate all stores for each tuple part, whilst updating the
9719 // pointer after each store correctly using vscale.
9720 while (NumParts) {
9721 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9722 MemOpChains.push_back(Store);
9723
9724 NumParts--;
9725 if (NumParts > 0) {
9726 SDValue BytesIncrement;
9727 if (isScalable) {
9728 BytesIncrement = DAG.getVScale(
9729 DL, Ptr.getValueType(),
9730 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9731 } else {
9732 BytesIncrement = DAG.getConstant(
9733 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9734 Ptr.getValueType());
9735 }
9736 MPI = MachinePointerInfo(MPI.getAddrSpace());
9737 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9738 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9739 ExtraArgLocs++;
9740 i++;
9741 }
9742 }
9743
9744 Arg = SpillSlot;
9745 break;
9746 }
9747
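// At this point Arg holds the (possibly promoted or spilled) outgoing value;
// it is now routed either to a physical register or to a stack slot,
// depending on where the calling convention assigned it.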
9748 if (VA.isRegLoc()) {
9749 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9750 Outs[0].VT == MVT::i64) {
9751 assert(VA.getLocVT() == MVT::i64 &&
9752 "unexpected calling convention register assignment");
9753 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9754 "unexpected use of 'returned'");
9755 IsThisReturn = true;
9756 }
9757 if (RegsUsed.count(VA.getLocReg())) {
9758 // If this register has already been used then we're trying to pack
9759 // parts of an [N x i32] into an X-register. The extension type will
9760 // take care of putting the two halves in the right place but we have to
9761 // combine them.
9762 SDValue &Bits =
9763 llvm::find_if(RegsToPass,
9764 [=](const std::pair<unsigned, SDValue> &Elt) {
9765 return Elt.first == VA.getLocReg();
9766 })
9767 ->second;
9768 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9769 // Call site info is used for function's parameter entry value
9770 // tracking. For now we track only simple cases when parameter
9771 // is transferred through whole register.
9772 llvm::erase_if(CSInfo.ArgRegPairs,
9773 [&VA](MachineFunction::ArgRegPair ArgReg) {
9774 return ArgReg.Reg == VA.getLocReg();
9775 });
9776 } else {
9777 // Add an extra level of indirection for streaming mode changes by
9778 // using a pseudo copy node that cannot be rematerialised between a
9779 // smstart/smstop and the call by the simple register coalescer.
9780 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9781 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9782 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9783 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9784 RegsUsed.insert(VA.getLocReg());
9785 const TargetOptions &Options = DAG.getTarget().Options;
9786 if (Options.EmitCallSiteInfo)
9787 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9788 }
9789 } else {
9790 assert(VA.isMemLoc());
9791
9792 SDValue DstAddr;
9793 MachinePointerInfo DstInfo;
9794
9795 // FIXME: This works on big-endian for composite byvals, which are the
9796 // common case. It should also work for fundamental types too.
9797 uint32_t BEAlign = 0;
9798 unsigned OpSize;
9799 if (VA.getLocInfo() == CCValAssign::Indirect ||
9800 VA.getLocInfo() == CCValAssign::Trunc)
9801 OpSize = VA.getLocVT().getFixedSizeInBits();
9802 else
9803 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9804 : VA.getValVT().getSizeInBits();
9805 OpSize = (OpSize + 7) / 8;
9806 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9807 !Flags.isInConsecutiveRegs()) {
9808 if (OpSize < 8)
9809 BEAlign = 8 - OpSize;
9810 }
9811 unsigned LocMemOffset = VA.getLocMemOffset();
9812 int32_t Offset = LocMemOffset + BEAlign;
9813
9814 if (IsTailCall) {
9815 // When the frame pointer is perfectly aligned for the tail call and the
9816 // same stack argument is passed down intact, we can reuse it.
9817 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9818 continue;
9819
9820 Offset = Offset + FPDiff;
9821 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9822
9823 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9824 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9825
9826 // Make sure any stack arguments overlapping with where we're storing
9827 // are loaded before this eventual operation. Otherwise they'll be
9828 // clobbered.
9829 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9830 } else {
9831 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9832
9833 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9834 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9835 }
9836
9837 if (Outs[i].Flags.isByVal()) {
9838 SDValue SizeNode =
9839 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9840 SDValue Cpy = DAG.getMemcpy(
9841 Chain, DL, DstAddr, Arg, SizeNode,
9842 Outs[i].Flags.getNonZeroByValAlign(),
9843 /*isVol = */ false, /*AlwaysInline = */ false,
9844 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9845
9846 MemOpChains.push_back(Cpy);
9847 } else {
9848 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9849 // promoted to a legal register type i32, we should truncate Arg back to
9850 // i1/i8/i16.
9851 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9852 VA.getValVT() == MVT::i16)
9853 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9854
9855 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9856 MemOpChains.push_back(Store);
9857 }
9858 }
9859 }
9860
9861 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9862 !(CLI.CB && CLI.CB->isMustTailCall())) {
9863 SDValue ParamPtr = StackPtr;
9864 if (IsTailCall) {
9865 // Create a dummy object at the top of the stack that can be used to get
9866 // the SP after the epilogue
9867 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9868 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9869 }
9870
9871 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9872 // describing the argument list. x4 contains the address of the
9873 // first stack parameter. x5 contains the size in bytes of all parameters
9874 // passed on the stack.
9875 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9876 RegsToPass.emplace_back(AArch64::X5,
9877 DAG.getConstant(NumBytes, DL, MVT::i64));
9878 }
9879
9880 if (!MemOpChains.empty())
9881 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9882
9883 SDValue InGlue;
9884 if (RequiresSMChange) {
9885 bool InsertVectorLengthCheck =
9887 Chain = changeStreamingMode(
9888 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9889 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9890 InGlue = Chain.getValue(1);
9891 }
9892
9893 // Build a sequence of copy-to-reg nodes chained together with token chain
9894 // and flag operands which copy the outgoing args into the appropriate regs.
9895 for (auto &RegToPass : RegsToPass) {
9896 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9897 RegToPass.second, InGlue);
9898 InGlue = Chain.getValue(1);
9899 }
9900
9901 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9902 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9903 // node so that legalize doesn't hack it.
9904 const GlobalValue *CalledGlobal = nullptr;
9905 unsigned OpFlags = 0;
9906 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9907 CalledGlobal = G->getGlobal();
9908 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9909 getTargetMachine());
9910 if (OpFlags & AArch64II::MO_GOT) {
9911 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9912 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9913 } else {
9914 const GlobalValue *GV = G->getGlobal();
9915 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9916 }
9917 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9918 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9919 Subtarget->isTargetMachO()) ||
9920 MF.getFunction().getParent()->getRtLibUseGOT();
9921 const char *Sym = S->getSymbol();
9922 if (UseGot) {
9923 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9924 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9925 } else {
9926 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9927 }
9928 }
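// At this point Callee is either a TargetGlobalAddress/TargetExternalSymbol
// (direct call) or a LOADgot of one (GOT-indirect call). A GOT-indirect call
// typically ends up as something like (illustrative):
//   adrp x8, :got:fn
//   ldr  x8, [x8, :got_lo12:fn]
//   blr  x8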
9929
9930 // We don't usually want to end the call-sequence here because we would tidy
9931 // the frame up *after* the call, however in the ABI-changing tail-call case
9932 // we've carefully laid out the parameters so that when sp is reset they'll be
9933 // in the correct location.
9934 if (IsTailCall && !IsSibCall) {
9935 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9936 InGlue = Chain.getValue(1);
9937 }
9938
9939 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9940
9941 std::vector<SDValue> Ops;
9942 Ops.push_back(Chain);
9943 Ops.push_back(Callee);
9944
9945 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9946 // be expanded to the call, directly followed by a special marker sequence and
9947 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9948 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9949 assert(!IsTailCall &&
9950 "tail calls cannot be marked with clang.arc.attachedcall");
9951 Opc = AArch64ISD::CALL_RVMARKER;
9952
9953 // Add a target global address for the retainRV/claimRV runtime function
9954 // just before the call target.
9955 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9956 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9957 Ops.insert(Ops.begin() + 1, GA);
9958
9959 // We may or may not need to emit both the marker and the retain/claim call.
9960 // Tell the pseudo expansion using an additional boolean op.
9961 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9962 SDValue DoEmitMarker =
9963 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9964 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9965 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9966 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9967 } else if (GuardWithBTI) {
9968 Opc = AArch64ISD::CALL_BTI;
9969 }
9970
9971 if (IsTailCall) {
9972 // Each tail call may have to adjust the stack by a different amount, so
9973 // this information must travel along with the operation for eventual
9974 // consumption by emitEpilogue.
9975 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9976 }
9977
9978 if (CLI.PAI) {
9979 const uint64_t Key = CLI.PAI->Key;
9981 "Invalid auth call key");
9982
9983 // Split the discriminator into address/integer components.
9984 SDValue AddrDisc, IntDisc;
9985 std::tie(IntDisc, AddrDisc) =
9986 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9987
9988 if (Opc == AArch64ISD::CALL_RVMARKER)
9989 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9990 else
9991 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9992 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9993 Ops.push_back(IntDisc);
9994 Ops.push_back(AddrDisc);
9995 }
9996
9997 // Add argument registers to the end of the list so that they are known live
9998 // into the call.
9999 for (auto &RegToPass : RegsToPass)
10000 Ops.push_back(DAG.getRegister(RegToPass.first,
10001 RegToPass.second.getValueType()));
10002
10003 // Add a register mask operand representing the call-preserved registers.
10004 const uint32_t *Mask;
10005 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10006 if (IsThisReturn) {
10007 // For 'this' returns, use the X0-preserving mask if applicable
10008 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10009 if (!Mask) {
10010 IsThisReturn = false;
10011 Mask = TRI->getCallPreservedMask(MF, CallConv);
10012 }
10013 } else
10014 Mask = TRI->getCallPreservedMask(MF, CallConv);
10015
10016 if (Subtarget->hasCustomCallingConv())
10017 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10018
10019 if (TRI->isAnyArgRegReserved(MF))
10020 TRI->emitReservedArgRegCallError(MF);
10021
10022 assert(Mask && "Missing call preserved mask for calling convention");
10023 Ops.push_back(DAG.getRegisterMask(Mask));
10024
10025 if (InGlue.getNode())
10026 Ops.push_back(InGlue);
10027
10028 // If we're doing a tail call, use a TC_RETURN here rather than an
10029 // actual call instruction.
10030 if (IsTailCall) {
10031 MF.getFrameInfo().setHasTailCall();
10032 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10033 if (IsCFICall)
10034 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10035
10036 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10037 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10038 if (CalledGlobal &&
10039 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10040 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10041 return Ret;
10042 }
10043
10044 // Returns a chain and a flag for retval copy to use.
10045 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10046 if (IsCFICall)
10047 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10048
10049 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10050 InGlue = Chain.getValue(1);
10051 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10052 if (CalledGlobal &&
10053 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10054 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10055
10056 uint64_t CalleePopBytes =
10057 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10058
10059 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10060 InGlue = Chain.getValue(1);
10061
10062 // Handle result values, copying them out of physregs into vregs that we
10063 // return.
10064 SDValue Result = LowerCallResult(
10065 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10066 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10067
10068 if (!Ins.empty())
10069 InGlue = Result.getValue(Result->getNumValues() - 1);
10070
10071 if (RequiresSMChange) {
10072 Result = changeStreamingMode(
10073 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10074 getSMToggleCondition(CallAttrs));
10075 }
10076
10077 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10078 // Unconditionally resume ZA.
10079 Result = DAG.getNode(
10080 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10081 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10082
10083 if (ShouldPreserveZT0)
10084 Result =
10085 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10086 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10087
10088 if (RequiresLazySave) {
10089 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10090 } else if (RequiresSaveAllZA) {
10091 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10092 /*IsSave=*/false);
10093 }
10094
10095 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10096 RequiresSaveAllZA) {
10097 for (unsigned I = 0; I < InVals.size(); ++I) {
10098 // The smstart/smstop is chained as part of the call, but when the
10099 // resulting chain is discarded (which happens when the call is not part
10100 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10101 // smstart/smstop is chained to the result value. We can do that by doing
10102 // a vreg -> vreg copy.
10103 Register Reg = MF.getRegInfo().createVirtualRegister(
10104 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10105 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10106 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10107 InVals[I].getValueType());
10108 }
10109 }
10110
10111 if (CallConv == CallingConv::PreserveNone) {
10112 for (const ISD::OutputArg &O : Outs) {
10113 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10114 O.Flags.isSwiftAsync()) {
10115 MachineFunction &MF = DAG.getMachineFunction();
10116 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10117 MF.getFunction(),
10118 "Swift attributes can't be used with preserve_none",
10119 DL.getDebugLoc()));
10120 break;
10121 }
10122 }
10123 }
10124
10125 return Result;
10126}
10127
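// Checks, via the return calling convention, whether all values described by
// Outs can be passed back in registers. If this returns false, ISel demotes
// the return value to an sret-style indirect return instead.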
10128bool AArch64TargetLowering::CanLowerReturn(
10129 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10130 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10131 const Type *RetTy) const {
10132 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10133 SmallVector<CCValAssign, 16> RVLocs;
10134 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10135 return CCInfo.CheckReturn(Outs, RetCC);
10136}
10137
10138SDValue
10139AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10140 bool isVarArg,
10141 const SmallVectorImpl<ISD::OutputArg> &Outs,
10142 const SmallVectorImpl<SDValue> &OutVals,
10143 const SDLoc &DL, SelectionDAG &DAG) const {
10144 auto &MF = DAG.getMachineFunction();
10145 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10146
10147 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10148 SmallVector<CCValAssign, 16> RVLocs;
10149 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10150 CCInfo.AnalyzeReturn(Outs, RetCC);
10151
10152 // Copy the result values into the output registers.
10153 SDValue Glue;
10154 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10155 SmallSet<unsigned, 4> RegsUsed;
10156 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10157 ++i, ++realRVLocIdx) {
10158 CCValAssign &VA = RVLocs[i];
10159 assert(VA.isRegLoc() && "Can only return in registers!");
10160 SDValue Arg = OutVals[realRVLocIdx];
10161
10162 switch (VA.getLocInfo()) {
10163 default:
10164 llvm_unreachable("Unknown loc info!");
10165 case CCValAssign::Full:
10166 if (Outs[i].ArgVT == MVT::i1) {
10167 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10168 // value. This is strictly redundant on Darwin (which uses "zeroext
10169 // i1"), but will be optimised out before ISel.
10170 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10171 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10172 }
10173 break;
10174 case CCValAssign::BCvt:
10175 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10176 break;
10177 case CCValAssign::AExt:
10178 case CCValAssign::ZExt:
10179 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10180 break;
10181 case CCValAssign::AExtUpper:
10182 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10183 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10184 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10185 DAG.getConstant(32, DL, VA.getLocVT()));
10186 break;
10187 }
10188
10189 if (RegsUsed.count(VA.getLocReg())) {
10190 SDValue &Bits =
10191 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10192 return Elt.first == VA.getLocReg();
10193 })->second;
10194 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10195 } else {
10196 RetVals.emplace_back(VA.getLocReg(), Arg);
10197 RegsUsed.insert(VA.getLocReg());
10198 }
10199 }
10200
10201 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10202
10203 // Emit SMSTOP before returning from a locally streaming function
10204 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10205 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10206 if (FuncAttrs.hasStreamingCompatibleInterface())
10207 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10208 /*Glue*/ SDValue(),
10209 AArch64SME::IfCallerIsStreaming);
10210 else
10211 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10212 /*Glue*/ SDValue(), AArch64SME::Always);
10213 Glue = Chain.getValue(1);
10214 }
10215
10216 SmallVector<SDValue, 4> RetOps(1, Chain);
10217 for (auto &RetVal : RetVals) {
10218 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10219 isPassedInFPR(RetVal.second.getValueType()))
10220 RetVal.second =
10221 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10222 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10223 RetVal.second);
10224 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10225 Glue = Chain.getValue(1);
10226 RetOps.push_back(
10227 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10228 }
10229
10230 // Windows AArch64 ABIs require that for returning structs by value we copy
10231 // the sret argument into X0 for the return.
10232 // We saved the argument into a virtual register in the entry block,
10233 // so now we copy the value out and into X0.
10234 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10235 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10236 getPointerTy(MF.getDataLayout()));
10237
10238 unsigned RetValReg = AArch64::X0;
10239 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10240 RetValReg = AArch64::X8;
10241 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10242 Glue = Chain.getValue(1);
10243
10244 RetOps.push_back(
10245 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10246 }
10247
10248 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10249 if (I) {
10250 for (; *I; ++I) {
10251 if (AArch64::GPR64RegClass.contains(*I))
10252 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10253 else if (AArch64::FPR64RegClass.contains(*I))
10254 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10255 else
10256 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10257 }
10258 }
10259
10260 RetOps[0] = Chain; // Update chain.
10261
10262 // Add the glue if we have it.
10263 if (Glue.getNode())
10264 RetOps.push_back(Glue);
10265
10266 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10267 // ARM64EC entry thunks use a special return sequence: instead of a regular
10268 // "ret" instruction, they need to explicitly call the emulator.
10269 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10270 SDValue Arm64ECRetDest =
10271 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10272 Arm64ECRetDest =
10273 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10274 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10275 MachinePointerInfo());
10276 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10277 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10278 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10279 }
10280
10281 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10282}
10283
10284//===----------------------------------------------------------------------===//
10285// Other Lowering Code
10286//===----------------------------------------------------------------------===//
10287
10288SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10289 SelectionDAG &DAG,
10290 unsigned Flag) const {
10291 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10292 N->getOffset(), Flag);
10293}
10294
10295SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10296 SelectionDAG &DAG,
10297 unsigned Flag) const {
10298 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10299}
10300
10301SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10302 SelectionDAG &DAG,
10303 unsigned Flag) const {
10304 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10305 N->getOffset(), Flag);
10306}
10307
10308SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10309 SelectionDAG &DAG,
10310 unsigned Flag) const {
10311 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10312}
10313
10314SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10315 SelectionDAG &DAG,
10316 unsigned Flag) const {
10317 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10318}
10319
10320// (loadGOT sym)
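// Typically materialized as (illustrative):
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]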
10321template <class NodeTy>
10322SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10323 unsigned Flags) const {
10324 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10325 SDLoc DL(N);
10326 EVT Ty = getPointerTy(DAG.getDataLayout());
10327 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10328 // FIXME: Once remat is capable of dealing with instructions with register
10329 // operands, expand this into two nodes instead of using a wrapper node.
10330 if (DAG.getMachineFunction()
10331 .getInfo<AArch64FunctionInfo>()
10332 ->hasELFSignedGOT())
10333 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10334 0);
10335 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10336}
10337
10338// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10339template <class NodeTy>
10340SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10341 unsigned Flags) const {
10342 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10343 SDLoc DL(N);
10344 EVT Ty = getPointerTy(DAG.getDataLayout());
10345 const unsigned char MO_NC = AArch64II::MO_NC;
10346 return DAG.getNode(
10347 AArch64ISD::WrapperLarge, DL, Ty,
10348 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10349 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10350 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10351 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10352}
10353
10354// (addlow (adrp %hi(sym)) %lo(sym))
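// Typically materialized as (illustrative):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym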
10355template <class NodeTy>
10356SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10357 unsigned Flags) const {
10358 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10359 SDLoc DL(N);
10360 EVT Ty = getPointerTy(DAG.getDataLayout());
10361 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10362 SDValue Lo = getTargetNode(N, Ty, DAG,
10363 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10364 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10365 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10366}
10367
10368// (adr sym)
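// Single-instruction form used for the tiny code model (illustrative):
//   adr x0, sym    ; sym must be within +/-1MiB of the PC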
10369template <class NodeTy>
10370SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10371 unsigned Flags) const {
10372 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10373 SDLoc DL(N);
10374 EVT Ty = getPointerTy(DAG.getDataLayout());
10375 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10376 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10377}
10378
10379SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10380 SelectionDAG &DAG) const {
10381 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10382 const GlobalValue *GV = GN->getGlobal();
10383 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10384
10385 if (OpFlags != AArch64II::MO_NO_FLAG)
10387 "unexpected offset in global node");
10388
10389 // This also catches the large code model case for Darwin, and tiny code
10390 // model with got relocations.
10391 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10392 return getGOT(GN, DAG, OpFlags);
10393 }
10394
10395 SDValue Result;
10396 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10397 !getTargetMachine().isPositionIndependent()) {
10398 Result = getAddrLarge(GN, DAG, OpFlags);
10399 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10400 Result = getAddrTiny(GN, DAG, OpFlags);
10401 } else {
10402 Result = getAddr(GN, DAG, OpFlags);
10403 }
10404 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10405 SDLoc DL(GN);
10406 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10407 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10408 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10409 return Result;
10410}
10411
10412/// Convert a TLS address reference into the correct sequence of loads
10413/// and calls to compute the variable's address (for Darwin, currently) and
10414/// return an SDValue containing the final node.
10415
10416/// Darwin only has one TLS scheme which must be capable of dealing with the
10417/// fully general situation, in the worst case. This means:
10418/// + "extern __thread" declaration.
10419/// + Defined in a possibly unknown dynamic library.
10420///
10421/// The general system is that each __thread variable has a [3 x i64] descriptor
10422/// which contains information used by the runtime to calculate the address. The
10423/// only part of this the compiler needs to know about is the first xword, which
10424/// contains a function pointer that must be called with the address of the
10425/// entire descriptor in "x0".
10426///
10427/// Since this descriptor may be in a different unit, in general even the
10428/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10429/// is:
10430/// adrp x0, _var@TLVPPAGE
10431/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10432/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10433/// ; the function pointer
10434/// blr x1 ; Uses descriptor address in x0
10435/// ; Address of _var is now in x0.
10436///
10437/// If the address of _var's descriptor *is* known to the linker, then it can
10438/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10439/// a slight efficiency gain.
10440SDValue
10441AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10442 SelectionDAG &DAG) const {
10443 assert(Subtarget->isTargetDarwin() &&
10444 "This function expects a Darwin target");
10445
10446 SDLoc DL(Op);
10447 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10448 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10449 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10450
10451 SDValue TLVPAddr =
10452 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10453 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10454
10455 // The first entry in the descriptor is a function pointer that we must call
10456 // to obtain the address of the variable.
10457 SDValue Chain = DAG.getEntryNode();
10458 SDValue FuncTLVGet = DAG.getLoad(
10459 PtrMemVT, DL, Chain, DescAddr,
10460 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10461 Align(PtrMemVT.getSizeInBits() / 8),
10462 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10463 Chain = FuncTLVGet.getValue(1);
10464
10465 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10466 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10467
10468 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10469 MFI.setAdjustsStack(true);
10470
10471 // TLS calls preserve all registers except those that absolutely must be
10472 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10473 // silly).
10474 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10475 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10476 if (Subtarget->hasCustomCallingConv())
10477 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10478
10479 // Finally, we can make the call. This is just a degenerate version of a
10480 // normal AArch64 call node: x0 takes the address of the descriptor, and
10481 // returns the address of the variable in this thread.
10482 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10483
10484 unsigned Opcode = AArch64ISD::CALL;
10485 SmallVector<SDValue, 8> Ops;
10486 Ops.push_back(Chain);
10487 Ops.push_back(FuncTLVGet);
10488
10489 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10490 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10491 Opcode = AArch64ISD::AUTH_CALL;
10492 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10493 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10494 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10495 }
10496
10497 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10498 Ops.push_back(DAG.getRegisterMask(Mask));
10499 Ops.push_back(Chain.getValue(1));
10500 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10501 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10502}
10503
10504/// Convert a thread-local variable reference into a sequence of instructions to
10505/// compute the variable's address for the local exec TLS model of ELF targets.
10506/// The sequence depends on the maximum TLS area size.
10507SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10508 SDValue ThreadBase,
10509 const SDLoc &DL,
10510 SelectionDAG &DAG) const {
10511 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10512 SDValue TPOff, Addr;
10513
10514 switch (DAG.getTarget().Options.TLSSize) {
10515 default:
10516 llvm_unreachable("Unexpected TLS size");
10517
10518 case 12: {
10519 // mrs x0, TPIDR_EL0
10520 // add x0, x0, :tprel_lo12:a
10521 SDValue Var = DAG.getTargetGlobalAddress(
10522 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10523 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10524 Var,
10525 DAG.getTargetConstant(0, DL, MVT::i32)),
10526 0);
10527 }
10528
10529 case 24: {
10530 // mrs x0, TPIDR_EL0
10531 // add x0, x0, :tprel_hi12:a
10532 // add x0, x0, :tprel_lo12_nc:a
10533 SDValue HiVar = DAG.getTargetGlobalAddress(
10534 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10535 SDValue LoVar = DAG.getTargetGlobalAddress(
10536 GV, DL, PtrVT, 0,
10537 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10538 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10539 HiVar,
10540 DAG.getTargetConstant(0, DL, MVT::i32)),
10541 0);
10542 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10543 LoVar,
10544 DAG.getTargetConstant(0, DL, MVT::i32)),
10545 0);
10546 }
10547
10548 case 32: {
10549 // mrs x1, TPIDR_EL0
10550 // movz x0, #:tprel_g1:a
10551 // movk x0, #:tprel_g0_nc:a
10552 // add x0, x1, x0
10553 SDValue HiVar = DAG.getTargetGlobalAddress(
10554 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10555 SDValue LoVar = DAG.getTargetGlobalAddress(
10556 GV, DL, PtrVT, 0,
10557 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10558 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10559 DAG.getTargetConstant(16, DL, MVT::i32)),
10560 0);
10561 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10562 DAG.getTargetConstant(0, DL, MVT::i32)),
10563 0);
10564 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10565 }
10566
10567 case 48: {
10568 // mrs x1, TPIDR_EL0
10569 // movz x0, #:tprel_g2:a
10570 // movk x0, #:tprel_g1_nc:a
10571 // movk x0, #:tprel_g0_nc:a
10572 // add x0, x1, x0
10573 SDValue HiVar = DAG.getTargetGlobalAddress(
10574 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10575 SDValue MiVar = DAG.getTargetGlobalAddress(
10576 GV, DL, PtrVT, 0,
10577 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10578 SDValue LoVar = DAG.getTargetGlobalAddress(
10579 GV, DL, PtrVT, 0,
10580 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10581 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10582 DAG.getTargetConstant(32, DL, MVT::i32)),
10583 0);
10584 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10585 DAG.getTargetConstant(16, DL, MVT::i32)),
10586 0);
10587 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10588 DAG.getTargetConstant(0, DL, MVT::i32)),
10589 0);
10590 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10591 }
10592 }
10593}
10594
10595/// When accessing thread-local variables under either the general-dynamic or
10596/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10597/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10598/// is a function pointer to carry out the resolution.
10599///
10600/// The sequence is:
10601/// adrp x0, :tlsdesc:var
10602/// ldr x1, [x0, #:tlsdesc_lo12:var]
10603/// add x0, x0, #:tlsdesc_lo12:var
10604/// .tlsdesccall var
10605/// blr x1
10606/// (TPIDR_EL0 offset now in x0)
10607///
10608/// The above sequence must be produced unscheduled, to enable the linker to
10609/// optimize/relax this sequence.
10610/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10611/// above sequence, and expanded really late in the compilation flow, to ensure
10612/// the sequence is produced as per above.
10613SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10614 const SDLoc &DL,
10615 SelectionDAG &DAG) const {
10616 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10617
10618 SDValue Chain = DAG.getEntryNode();
10619 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10620
10621 unsigned Opcode =
10622 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10623 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10624 : AArch64ISD::TLSDESC_CALLSEQ;
10625 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10626 SDValue Glue = Chain.getValue(1);
10627
10628 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10629}
10630
10631SDValue
10632AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10633 SelectionDAG &DAG) const {
10634 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10635
10636 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10637 AArch64FunctionInfo *MFI =
10638 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10639
10640 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
10643
10644 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10645 if (Model == TLSModel::LocalDynamic)
10646 Model = TLSModel::GeneralDynamic;
10647 }
10648
10649 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10650 Model != TLSModel::LocalExec)
10651 report_fatal_error("ELF TLS only supported in small memory model or "
10652 "in local exec TLS model");
10653 // Different choices can be made for the maximum size of the TLS area for a
10654 // module. For the small address model, the default TLS size is 16MiB and the
10655 // maximum TLS size is 4GiB.
10656 // FIXME: add tiny and large code model support for TLS access models other
10657 // than local exec. We currently generate the same code as small for tiny,
10658 // which may be larger than needed.
10659
10660 SDValue TPOff;
10661 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10662 SDLoc DL(Op);
10663 const GlobalValue *GV = GA->getGlobal();
10664
10665 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10666
10667 if (Model == TLSModel::LocalExec) {
10668 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10669 } else if (Model == TLSModel::InitialExec) {
10670 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10671 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10672 } else if (Model == TLSModel::LocalDynamic) {
10673 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10674 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10675 // the beginning of the module's TLS region, followed by a DTPREL offset
10676 // calculation.
10677
10678 // These accesses will need deduplicating if there's more than one.
10679 MFI->incNumLocalDynamicTLSAccesses();
10680
10681 // The call needs a relocation too for linker relaxation. It doesn't make
10682 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10683 // the address.
10684 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10685 AArch64II::MO_TLS);
10686
10687 // Now we can calculate the offset from TPIDR_EL0 to this module's
10688 // thread-local area.
10689 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10690
10691 // Now use :dtprel_whatever: operations to calculate this variable's offset
10692 // in its thread-storage area.
10693 SDValue HiVar = DAG.getTargetGlobalAddress(
10694 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10695 SDValue LoVar = DAG.getTargetGlobalAddress(
10696 GV, DL, MVT::i64, 0,
10697 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10698
10699 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10700 DAG.getTargetConstant(0, DL, MVT::i32)),
10701 0);
10702 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10703 DAG.getTargetConstant(0, DL, MVT::i32)),
10704 0);
10705 } else if (Model == TLSModel::GeneralDynamic) {
10706 // The call needs a relocation too for linker relaxation. It doesn't make
10707 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10708 // the address.
10709 SDValue SymAddr =
10710 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10711
10712 // Finally we can make a call to calculate the offset from tpidr_el0.
10713 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10714 } else
10715 llvm_unreachable("Unsupported ELF TLS access model");
10716
10717 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10718}
10719
10720SDValue
10721AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10722 SelectionDAG &DAG) const {
10723 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10724
10725 SDValue Chain = DAG.getEntryNode();
10726 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10727 SDLoc DL(Op);
10728
10729 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10730
10731 // Load the ThreadLocalStoragePointer from the TEB
10732 // A pointer to the TLS array is located at offset 0x58 from the TEB.
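// The address computed below is, roughly:
//   TEB[0x58][_tls_index] + (sym's offset within the .tls section)
// i.e. this thread's copy of the module's TLS block plus the variable's
// section-relative offset.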
10733 SDValue TLSArray =
10734 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10735 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10736 Chain = TLSArray.getValue(1);
10737
10738 // Load the TLS index from the C runtime;
10739 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10740 // This also does the same as LOADgot, but using a generic i32 load,
10741 // while LOADgot only loads i64.
10742 SDValue TLSIndexHi =
10743 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10744 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10745 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10746 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10747 SDValue TLSIndex =
10748 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10749 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10750 Chain = TLSIndex.getValue(1);
10751
10752 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
10753 // offset into the TLSArray.
10754 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10755 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10756 DAG.getConstant(3, DL, PtrVT));
10757 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10758 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10759 MachinePointerInfo());
10760 Chain = TLS.getValue(1);
10761
10762 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10763 const GlobalValue *GV = GA->getGlobal();
10764 SDValue TGAHi = DAG.getTargetGlobalAddress(
10765 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10766 SDValue TGALo = DAG.getTargetGlobalAddress(
10767 GV, DL, PtrVT, 0,
10769
10770 // Add the offset from the start of the .tls section (section base).
10771 SDValue Addr =
10772 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10773 DAG.getTargetConstant(0, DL, MVT::i32)),
10774 0);
10775 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10776 return Addr;
10777}
10778
10779SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10780 SelectionDAG &DAG) const {
10781 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10782 if (DAG.getTarget().useEmulatedTLS())
10783 return LowerToTLSEmulatedModel(GA, DAG);
10784
10785 if (Subtarget->isTargetDarwin())
10786 return LowerDarwinGlobalTLSAddress(Op, DAG);
10787 if (Subtarget->isTargetELF())
10788 return LowerELFGlobalTLSAddress(Op, DAG);
10789 if (Subtarget->isTargetWindows())
10790 return LowerWindowsGlobalTLSAddress(Op, DAG);
10791
10792 llvm_unreachable("Unexpected platform trying to use TLS");
10793}
10794
10795//===----------------------------------------------------------------------===//
10796// PtrAuthGlobalAddress lowering
10797//
10798// We have 3 lowering alternatives to choose from:
10799// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10800// If the GV doesn't need a GOT load (i.e., is locally defined)
10801// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10802//
10803// - LOADgotPAC: similar to LOADgot, with added PAC.
10804// If the GV needs a GOT load, materialize the pointer using the usual
10805// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10806// section is assumed to be read-only (for example, via relro mechanism). See
10807// LowerMOVaddrPAC.
10808//
10809// - LOADauthptrstatic: similar to LOADgot, but use a
10810// special stub slot instead of a GOT slot.
10811// Load a signed pointer for symbol 'sym' from a stub slot named
10812// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10813// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10814// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10815//
10816// All 3 are pseudos that are expand late to longer sequences: this lets us
10817// provide integrity guarantees on the to-be-signed intermediate values.
10818//
10819// LOADauthptrstatic is undesirable because it requires a large section filled
10820// with often similarly-signed pointers, making it a good harvesting target.
10821// Thus, it's only used for ptrauth references to extern_weak to avoid null
10822// checks.
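// A ptrauth global reference originates from IR such as (illustrative):
//   @fptr = global ptr ptrauth (ptr @callee, i32 0, i64 1234)
// where the operands are the pointer, the signing key and the integer
// discriminator (an address-discriminator operand may also be present).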
10823
10824 static SDValue LowerPtrAuthGlobalAddressStatically(
10825 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10826 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10827 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10828 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10829
10830 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10831 // offset alone as a pointer if the symbol wasn't available, which would
10832 // probably break null checks in users. Ptrauth complicates things further:
10833 // error out.
10834 if (TGN->getOffset() != 0)
10836 "unsupported non-zero offset in weak ptrauth global reference");
10837
10838 if (!isNullConstant(AddrDiscriminator))
10839 report_fatal_error("unsupported weak addr-div ptrauth global");
10840
10841 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10842 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10843 {TGA, Key, Discriminator}),
10844 0);
10845}
10846
10847SDValue
10848AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10849 SelectionDAG &DAG) const {
10850 SDValue Ptr = Op.getOperand(0);
10851 uint64_t KeyC = Op.getConstantOperandVal(1);
10852 SDValue AddrDiscriminator = Op.getOperand(2);
10853 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10854 EVT VT = Op.getValueType();
10855 SDLoc DL(Op);
10856
10857 if (KeyC > AArch64PACKey::LAST)
10858 report_fatal_error("key in ptrauth global out of range [0, " +
10859 Twine((int)AArch64PACKey::LAST) + "]");
10860
10861 // Blend only works if the integer discriminator is 16-bit wide.
10862 if (!isUInt<16>(DiscriminatorC))
10864 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10865
10866 // Choosing between 3 lowering alternatives is target-specific.
10867 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10868 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10869
10870 int64_t PtrOffsetC = 0;
10871 if (Ptr.getOpcode() == ISD::ADD) {
10872 PtrOffsetC = Ptr.getConstantOperandVal(1);
10873 Ptr = Ptr.getOperand(0);
10874 }
10875 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10876 const GlobalValue *PtrGV = PtrN->getGlobal();
10877
10878 // Classify the reference to determine whether it needs a GOT load.
10879 const unsigned OpFlags =
10880 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10881 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10882 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10883 "unsupported non-GOT op flags on ptrauth global reference");
10884
10885 // Fold any offset into the GV; our pseudos expect it there.
10886 PtrOffsetC += PtrN->getOffset();
10887 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10888 /*TargetFlags=*/0);
10889 assert(PtrN->getTargetFlags() == 0 &&
10890 "unsupported target flags on ptrauth global");
10891
10892 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10893 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10894 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10895 ? AddrDiscriminator
10896 : DAG.getRegister(AArch64::XZR, MVT::i64);
10897
10898 // No GOT load needed -> MOVaddrPAC
10899 if (!NeedsGOTLoad) {
10900 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10901 return SDValue(
10902 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10903 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10904 0);
10905 }
10906
10907 // GOT load -> LOADgotPAC
10908 // Note that we disallow extern_weak refs to avoid null checks later.
10909 if (!PtrGV->hasExternalWeakLinkage())
10910 return SDValue(
10911 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10912 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10913 0);
10914
10915 // extern_weak ref -> LOADauthptrstatic
10916 return LowerPtrAuthGlobalAddressStatically(
10917 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10918 DAG);
10919}
10920
10921// Looks through \param Val to determine the bit that can be used to
10922// check the sign of the value. It returns the unextended value and
10923// the sign bit position.
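// For example (illustrative), (sign_extend_inreg i64 %x, i8) yields {%x, 7},
// while a plain i32 value yields {value, 31}.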
10924std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10925 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10926 return {Val.getOperand(0),
10927 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10928 1};
10929
10930 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10931 return {Val.getOperand(0),
10932 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10933
10934 return {Val, Val.getValueSizeInBits() - 1};
10935}
10936
10937SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10938 SDValue Chain = Op.getOperand(0);
10939 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10940 SDValue LHS = Op.getOperand(2);
10941 SDValue RHS = Op.getOperand(3);
10942 SDValue Dest = Op.getOperand(4);
10943 SDLoc DL(Op);
10944
10945 MachineFunction &MF = DAG.getMachineFunction();
10946 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10947 // will not be produced, as they are conditional branch instructions that do
10948 // not set flags.
10949 bool ProduceNonFlagSettingCondBr =
10950 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10951
10952 // Handle f128 first, since lowering it will result in comparing the return
10953 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10954 // is expecting to deal with.
10955 if (LHS.getValueType() == MVT::f128) {
10956 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10957
10958 // If softenSetCCOperands returned a scalar, we need to compare the result
10959 // against zero to select between true and false values.
10960 if (!RHS.getNode()) {
10961 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10962 CC = ISD::SETNE;
10963 }
10964 }
10965
10966 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10967 // instruction.
10968 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10969 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10970 // Only lower legal XALUO ops.
10971 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10972 return SDValue();
10973
10974 // The actual operation with overflow check.
10975 AArch64CC::CondCode OFCC;
10976 SDValue Value, Overflow;
10977 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10978
10979 if (CC == ISD::SETNE)
10980 OFCC = getInvertedCondCode(OFCC);
10981 SDValue CCVal = getCondCode(DAG, OFCC);
10982
10983 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10984 Overflow);
10985 }
10986
10987 if (LHS.getValueType().isInteger()) {
10988 assert((LHS.getValueType() == RHS.getValueType()) &&
10989 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10990
10991 // If the RHS of the comparison is zero, we can potentially fold this
10992 // to a specialized branch.
10993 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10994 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10995 if (CC == ISD::SETEQ) {
10996 // See if we can use a TBZ to fold in an AND as well.
10997 // TBZ has a smaller branch displacement than CBZ. If the offset is
10998 // out of bounds, a late MI-layer pass rewrites branches.
10999 // 403.gcc is an example that hits this case.
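// For example (illustrative), (brcond (seteq (and x, 8), 0), dest) can be
// emitted as a single "tbz x, #3, dest".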
11000 if (LHS.getOpcode() == ISD::AND &&
11001 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11002 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11003 SDValue Test = LHS.getOperand(0);
11004 uint64_t Mask = LHS.getConstantOperandVal(1);
11005 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
11006 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11007 Dest);
11008 }
11009
11010 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11011 } else if (CC == ISD::SETNE) {
11012 // See if we can use a TBZ to fold in an AND as well.
11013 // TBZ has a smaller branch displacement than CBZ. If the offset is
11014 // out of bounds, a late MI-layer pass rewrites branches.
11015 // 403.gcc is an example that hits this case.
11016 if (LHS.getOpcode() == ISD::AND &&
11017 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11018 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11019 SDValue Test = LHS.getOperand(0);
11020 uint64_t Mask = LHS.getConstantOperandVal(1);
11021 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
11022 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11023 Dest);
11024 }
11025
11026 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11027 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11028 // Don't combine AND since emitComparison converts the AND to an ANDS
11029 // (a.k.a. TST) and the test in the test bit and branch instruction
11030 // becomes redundant. This would also increase register pressure.
11031 uint64_t SignBitPos;
11032 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11033 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11034 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11035 }
11036 }
11037 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11038 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11039 // Don't combine AND since emitComparison converts the AND to an ANDS
11040 // (a.k.a. TST) and the test in the test bit and branch instruction
11041 // becomes redundant. This would also increase register pressure.
11042 uint64_t SignBitPos;
11043 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11044 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11045 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11046 }
11047
11048 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11049 // larger branch displacement but do prefer CB over cmp + br.
11050 if (Subtarget->hasCMPBR() &&
11052 ProduceNonFlagSettingCondBr) {
11053 SDValue Cond =
11055 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11056 Dest);
11057 }
11058
11059 SDValue CCVal;
11060 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11061 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11062 Cmp);
11063 }
11064
11065 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11066 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11067
11068 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11069 // clean. Some of them require two branches to implement.
11070 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11071 AArch64CC::CondCode CC1, CC2;
11072 changeFPCCToAArch64CC(CC, CC1, CC2);
11073 SDValue CC1Val = getCondCode(DAG, CC1);
11074 SDValue BR1 =
11075 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11076 if (CC2 != AArch64CC::AL) {
11077 SDValue CC2Val = getCondCode(DAG, CC2);
11078 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11079 Cmp);
11080 }
11081
11082 return BR1;
11083}
11084
11085SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11086 SelectionDAG &DAG) const {
11087 if (!Subtarget->isNeonAvailable() &&
11088 !Subtarget->useSVEForFixedLengthVectors())
11089 return SDValue();
11090
11091 EVT VT = Op.getValueType();
11092 EVT IntVT = VT.changeTypeToInteger();
11093 SDLoc DL(Op);
11094
11095 SDValue In1 = Op.getOperand(0);
11096 SDValue In2 = Op.getOperand(1);
11097 EVT SrcVT = In2.getValueType();
11098
11099 if (!SrcVT.bitsEq(VT))
11100 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11101
11102 if (VT.isScalableVector())
11103 IntVT =
11104 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11105
11106 if (VT.isFixedLengthVector() &&
11107 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11108 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11109
11110 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11111 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11112
11113 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11114 return convertFromScalableVector(DAG, VT, Res);
11115 }
11116
11117 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11118 // a SVE FCOPYSIGN.
11119 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11120 Subtarget->isSVEorStreamingSVEAvailable()) {
11121 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11122 return SDValue();
11123 EVT SVT = getPackedSVEVectorVT(VT);
11124
11125 SDValue Ins1 =
11126 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11127 DAG.getConstant(0, DL, MVT::i64));
11128 SDValue Ins2 =
11129 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11130 DAG.getConstant(0, DL, MVT::i64));
11131 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11132 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11133 DAG.getConstant(0, DL, MVT::i64));
11134 }
11135
11136 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11137 if (VT.isScalableVector())
11138 return getSVESafeBitCast(VT, Op, DAG);
11139
11140 return DAG.getBitcast(VT, Op);
11141 };
11142
11143 SDValue VecVal1, VecVal2;
11144 EVT VecVT;
11145 auto SetVecVal = [&](int Idx = -1) {
11146 if (!VT.isVector()) {
11147 VecVal1 =
11148 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11149 VecVal2 =
11150 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11151 } else {
11152 VecVal1 = BitCast(VecVT, In1, DAG);
11153 VecVal2 = BitCast(VecVT, In2, DAG);
11154 }
11155 };
11156 if (VT.isVector()) {
11157 VecVT = IntVT;
11158 SetVecVal();
11159 } else if (VT == MVT::f64) {
11160 VecVT = MVT::v2i64;
11161 SetVecVal(AArch64::dsub);
11162 } else if (VT == MVT::f32) {
11163 VecVT = MVT::v4i32;
11164 SetVecVal(AArch64::ssub);
11165 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11166 VecVT = MVT::v8i16;
11167 SetVecVal(AArch64::hsub);
11168 } else {
11169 llvm_unreachable("Invalid type for copysign!");
11170 }
11171
11172 unsigned BitWidth = In1.getScalarValueSizeInBits();
11173 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11174
11175 // We want to materialize a mask with every bit but the high bit set, but the
11176 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11177 // 64-bit elements. Instead, materialize all bits set and then negate that.
11178 if (VT == MVT::f64 || VT == MVT::v2f64) {
11179 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11180 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11181 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11182 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11183 }
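  // Worked example of the trick above: MOVI can put 0xFFFFFFFFFFFFFFFF in each
  // 64-bit lane; reinterpreting that as f64 and negating it flips only bit 63,
  // leaving 0x7FFFFFFFFFFFFFFF, i.e. every bit set except the sign bit.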
11184
11185 SDValue BSP =
11186 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11187 if (VT == MVT::f16 || VT == MVT::bf16)
11188 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11189 if (VT == MVT::f32)
11190 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11191 if (VT == MVT::f64)
11192 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11193
11194 return BitCast(VT, BSP, DAG);
11195}
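// Rough sketch of the lowering above: BSP(Mask, A, B) computes
// (Mask & A) | (~Mask & B). With Mask = ~SignMask this keeps the magnitude
// bits of In1 and takes the sign bit from In2, which is exactly
// copysign(In1, In2); e.g. for f32 the mask is 0x7FFFFFFF.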
11196
11197SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11198                                                  SelectionDAG &DAG) const {
11199   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11200           Attribute::NoImplicitFloat))
11201 return SDValue();
11202
11203 EVT VT = Op.getValueType();
11204 if (VT.isScalableVector() ||
11205 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11206 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11207
11208 bool IsParity = Op.getOpcode() == ISD::PARITY;
11209 SDValue Val = Op.getOperand(0);
11210 SDLoc DL(Op);
11211
11212   // For i32 parity, the generic expansion using EORs on GPRs is more
11213   // efficient than going through the floating-point/SIMD registers.
11214 if (VT == MVT::i32 && IsParity)
11215 return SDValue();
11216
11217 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11218 if (VT == MVT::i32 || VT == MVT::i64) {
11219 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11220 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11221 DAG.getUNDEF(ContainerVT), Val,
11222 DAG.getVectorIdxConstant(0, DL));
11223 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11224 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11225 DAG.getVectorIdxConstant(0, DL));
11226 if (IsParity)
11227 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11228 return Val;
11229 }
11230
11231 if (VT == MVT::i128) {
11232 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11233 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11234 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11235 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11236 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11237 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11238 if (IsParity)
11239 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11240 return Val;
11241 }
11242 }
11243
11244 if (!Subtarget->isNeonAvailable())
11245 return SDValue();
11246
11247 // If there is no CNT instruction available, GPR popcount can
11248 // be more efficiently lowered to the following sequence that uses
11249 // AdvSIMD registers/instructions as long as the copies to/from
11250 // the AdvSIMD registers are cheap.
11251 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11252 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11253 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11254 // FMOV X0, D0 // copy result back to integer reg
11255 if (VT == MVT::i32 || VT == MVT::i64) {
11256 if (VT == MVT::i32)
11257 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11258 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11259
11260 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11261 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11262 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11263 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11264 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11265 DAG.getConstant(0, DL, MVT::i64));
11266 if (IsParity)
11267 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11268 return AddV;
11269 } else if (VT == MVT::i128) {
11270 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11271
11272 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11273 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11274 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11275 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11276 DAG.getConstant(0, DL, MVT::i64));
11277 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11278 if (IsParity)
11279 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11280 return AddV;
11281 }
11282
11283 assert(!IsParity && "ISD::PARITY of vector types not supported");
11284
11285 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11286 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11287 "Unexpected type for custom ctpop lowering");
11288
11289 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11290 Val = DAG.getBitcast(VT8Bit, Val);
11291 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11292
11293 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11294 VT.getVectorNumElements() >= 2) {
11295 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11296 SDValue Zeros = DAG.getConstant(0, DL, DT);
11297 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11298
11299 if (VT == MVT::v2i64) {
11300 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11301 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11302 } else if (VT == MVT::v2i32) {
11303 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11304 } else if (VT == MVT::v4i32) {
11305 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11306 } else {
11307 llvm_unreachable("Unexpected type for custom ctpop lowering");
11308 }
11309
11310 return Val;
11311 }
11312
11313 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
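  // For example, a v4i32 ctpop is expected to become approximately:
  //   cnt    v0.16b, v0.16b   // per-byte pop-counts
  //   uaddlp v0.8h,  v0.16b   // pairwise add bytes -> halfwords
  //   uaddlp v0.4s,  v0.8h    // pairwise add halfwords -> words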
11314 unsigned EltSize = 8;
11315 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11316 while (EltSize != VT.getScalarSizeInBits()) {
11317 EltSize *= 2;
11318 NumElts /= 2;
11319 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11320 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11321 }
11322
11323 return Val;
11324}
11325
11326SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11327 EVT VT = Op.getValueType();
11328   assert(VT.isScalableVector() ||
11329          useSVEForFixedLengthVectorVT(
11330              VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11331
11332 SDLoc DL(Op);
11333 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11334 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11335}
11336
11337SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11338 SelectionDAG &DAG) const {
11339
11340 EVT VT = Op.getValueType();
11341 SDLoc DL(Op);
11342 unsigned Opcode = Op.getOpcode();
11343 ISD::CondCode CC;
11344 switch (Opcode) {
11345 default:
11346 llvm_unreachable("Wrong instruction");
11347 case ISD::SMAX:
11348 CC = ISD::SETGT;
11349 break;
11350 case ISD::SMIN:
11351 CC = ISD::SETLT;
11352 break;
11353 case ISD::UMAX:
11354 CC = ISD::SETUGT;
11355 break;
11356 case ISD::UMIN:
11357 CC = ISD::SETULT;
11358 break;
11359 }
11360
11361   if (VT.isScalableVector() ||
11362       useSVEForFixedLengthVectorVT(
11363           VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11364 switch (Opcode) {
11365 default:
11366 llvm_unreachable("Wrong instruction");
11367 case ISD::SMAX:
11368 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11369 case ISD::SMIN:
11370 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11371 case ISD::UMAX:
11372 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11373 case ISD::UMIN:
11374 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11375 }
11376 }
11377
11378 SDValue Op0 = Op.getOperand(0);
11379 SDValue Op1 = Op.getOperand(1);
11380 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11381 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11382}
11383
11384SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11385 SelectionDAG &DAG) const {
11386 EVT VT = Op.getValueType();
11387
11388   if (VT.isScalableVector() ||
11389       useSVEForFixedLengthVectorVT(
11390           VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11392
11393 SDLoc DL(Op);
11394 SDValue REVB;
11395 MVT VST;
11396
11397 switch (VT.getSimpleVT().SimpleTy) {
11398 default:
11399 llvm_unreachable("Invalid type for bitreverse!");
11400
11401 case MVT::v2i32: {
11402 VST = MVT::v8i8;
11403 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11404
11405 break;
11406 }
11407
11408 case MVT::v4i32: {
11409 VST = MVT::v16i8;
11410 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11411
11412 break;
11413 }
11414
11415 case MVT::v1i64: {
11416 VST = MVT::v8i8;
11417 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11418
11419 break;
11420 }
11421
11422 case MVT::v2i64: {
11423 VST = MVT::v16i8;
11424 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11425
11426 break;
11427 }
11428 }
11429
11430 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11431 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11432}
11433
11434 // Check whether N is (part of) a continuous comparison chain of ORs over XOR
11434 // leaves.
11435static bool
11436isOrXorChain(SDValue N, unsigned &Num,
11437 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11438 if (Num == MaxXors)
11439 return false;
11440
11441 // Skip the one-use zext
11442 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11443 N = N->getOperand(0);
11444
11445 // The leaf node must be XOR
11446 if (N->getOpcode() == ISD::XOR) {
11447 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11448 Num++;
11449 return true;
11450 }
11451
11452 // All the non-leaf nodes must be OR.
11453 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11454 return false;
11455
11456 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11457 isOrXorChain(N->getOperand(1), Num, WorkList))
11458 return true;
11459 return false;
11460}
11461
11462 // Transform chains of ORs and XORs, which are usually produced by expanded
11462 // memcmp/bcmp.
11464 SDValue LHS = N->getOperand(0);
11465 SDValue RHS = N->getOperand(1);
11466 SDLoc DL(N);
11467   EVT VT = N->getValueType(0);
11468   SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11469
11470 // Only handle integer compares.
11471 if (N->getOpcode() != ISD::SETCC)
11472 return SDValue();
11473
11474 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11475 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11476 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11477 unsigned NumXors = 0;
11478 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11479 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11480 isOrXorChain(LHS, NumXors, WorkList)) {
11481 SDValue XOR0, XOR1;
11482 std::tie(XOR0, XOR1) = WorkList[0];
11483 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11484 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11485 for (unsigned I = 1; I < WorkList.size(); I++) {
11486 std::tie(XOR0, XOR1) = WorkList[I];
11487 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11488 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11489 }
11490
11491     // Exit early by inverting the condition, which helps reduce indentation.
11492 return Cmp;
11493 }
11494
11495 return SDValue();
11496}
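// Illustrative example (a sketch, not taken from a test): for
//   setcc (or (xor a, b), (xor c, d)), 0, eq
// the per-pair SETCCs produced above are expected to later combine into
// something like:
//   cmp  x0, x1
//   ccmp x2, x3, #0, eq
//   cset w0, eq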
11497
11498SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11499
11500 if (Op.getValueType().isVector())
11501 return LowerVSETCC(Op, DAG);
11502
11503 bool IsStrict = Op->isStrictFPOpcode();
11504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11505 unsigned OpNo = IsStrict ? 1 : 0;
11506 SDValue Chain;
11507 if (IsStrict)
11508 Chain = Op.getOperand(0);
11509 SDValue LHS = Op.getOperand(OpNo + 0);
11510 SDValue RHS = Op.getOperand(OpNo + 1);
11511 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11512 SDLoc DL(Op);
11513
11514 // We chose ZeroOrOneBooleanContents, so use zero and one.
11515 EVT VT = Op.getValueType();
11516 SDValue TVal = DAG.getConstant(1, DL, VT);
11517 SDValue FVal = DAG.getConstant(0, DL, VT);
11518
11519 // Handle f128 first, since one possible outcome is a normal integer
11520 // comparison which gets picked up by the next if statement.
11521 if (LHS.getValueType() == MVT::f128) {
11522 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11523 IsSignaling);
11524
11525 // If softenSetCCOperands returned a scalar, use it.
11526 if (!RHS.getNode()) {
11527 assert(LHS.getValueType() == Op.getValueType() &&
11528 "Unexpected setcc expansion!");
11529 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11530 }
11531 }
11532
11533 if (LHS.getValueType().isInteger()) {
11534
11535 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11536
11537     SDValue CCVal;
11538     SDValue Cmp = getAArch64Cmp(
11539         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11540
11541 // Note that we inverted the condition above, so we reverse the order of
11542 // the true and false operands here. This will allow the setcc to be
11543 // matched to a single CSINC instruction.
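    // For example (i32 sketch), (setcc x, y, eq) is expected to become roughly:
    //   cmp  w0, w1
    //   cset w8, eq        ; cset is an alias of "csinc w8, wzr, wzr, ne"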
11544 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11545 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11546 }
11547
11548 // Now we know we're dealing with FP values.
11549 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11550 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11551
11552 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11553 // and do the comparison.
11554 SDValue Cmp;
11555 if (IsStrict)
11556 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11557 else
11558 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11559
11560 AArch64CC::CondCode CC1, CC2;
11561 changeFPCCToAArch64CC(CC, CC1, CC2);
11562 SDValue Res;
11563 if (CC2 == AArch64CC::AL) {
11564 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11565 CC2);
11566 SDValue CC1Val = getCondCode(DAG, CC1);
11567
11568 // Note that we inverted the condition above, so we reverse the order of
11569 // the true and false operands here. This will allow the setcc to be
11570 // matched to a single CSINC instruction.
11571 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11572 } else {
11573 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11574 // totally clean. Some of them require two CSELs to implement. As is in
11575 // this case, we emit the first CSEL and then emit a second using the output
11576 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11577
11578 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11579 SDValue CC1Val = getCondCode(DAG, CC1);
11580 SDValue CS1 =
11581 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11582
11583 SDValue CC2Val = getCondCode(DAG, CC2);
11584 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11585 }
11586 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11587}
11588
11589SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11590 SelectionDAG &DAG) const {
11591
11592 SDValue LHS = Op.getOperand(0);
11593 SDValue RHS = Op.getOperand(1);
11594 EVT VT = LHS.getValueType();
11595 if (VT != MVT::i32 && VT != MVT::i64)
11596 return SDValue();
11597
11598 SDLoc DL(Op);
11599 SDValue Carry = Op.getOperand(2);
11600 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11601 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11602 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11603 LHS, RHS, InvCarry);
11604
11605 EVT OpVT = Op.getValueType();
11606 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11607 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11608
11609   ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11610   ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11611   SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11612 // Inputs are swapped because the condition is inverted. This will allow
11613 // matching with a single CSINC instruction.
11614 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11615 Cmp.getValue(1));
11616}
11617
11618 /// Emit vector comparison for floating-point values, producing a mask.
11619 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11620                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
11621 const SDLoc &DL, SelectionDAG &DAG) {
11622 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11623 "function only supposed to emit natural comparisons");
11624
11625 switch (CC) {
11626 default:
11627 return SDValue();
11628 case AArch64CC::NE: {
11629 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11630 // Use vector semantics for the inversion to potentially save a copy between
11631 // SIMD and regular registers.
11632 if (!LHS.getValueType().isVector()) {
11633 EVT VecVT =
11634 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11635 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11636 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11637 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11638 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11639 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11640 }
11641 return DAG.getNOT(DL, Fcmeq, VT);
11642 }
11643 case AArch64CC::EQ:
11644 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11645 case AArch64CC::GE:
11646 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11647 case AArch64CC::GT:
11648 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11649 case AArch64CC::LE:
11650 if (!NoNans)
11651 return SDValue();
11652     // If we ignore NaNs then we can use the LS implementation.
11653 [[fallthrough]];
11654 case AArch64CC::LS:
11655 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11656 case AArch64CC::LT:
11657 if (!NoNans)
11658 return SDValue();
11659     // If we ignore NaNs then we can use the MI implementation.
11660 [[fallthrough]];
11661 case AArch64CC::MI:
11662 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11663 }
11664}
11665
11666/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11667 /// values are scalars, try to emit a mask generating vector instruction.
11668 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11669                                     SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11670 const SDLoc &DL, SelectionDAG &DAG) {
11671 assert(!LHS.getValueType().isVector());
11672 assert(!RHS.getValueType().isVector());
11673
11674 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11675 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11676 if (!CTVal || !CFVal)
11677 return {};
11678 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11679 !(CTVal->isZero() && CFVal->isAllOnes()))
11680 return {};
11681
11682 if (CTVal->isZero())
11683 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11684
11685 EVT VT = TVal.getValueType();
11686 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11687 return {};
11688
11689 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11690 bool OneNaN = false;
11691 if (LHS == RHS) {
11692 OneNaN = true;
11693 } else if (DAG.isKnownNeverNaN(RHS)) {
11694 OneNaN = true;
11695 RHS = LHS;
11696 } else if (DAG.isKnownNeverNaN(LHS)) {
11697 OneNaN = true;
11698 LHS = RHS;
11699 }
11700 if (OneNaN)
11701 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11702 }
11703
11706 bool ShouldInvert = false;
11707 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11708 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11709 SDValue Cmp2;
11710 if (CC2 != AArch64CC::AL) {
11711 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11712 if (!Cmp2)
11713 return {};
11714 }
11715 if (!Cmp2 && !ShouldInvert)
11716 return Cmp;
11717
11718 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11719 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11720 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11721 Zero);
11722 if (Cmp2) {
11723 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11724 Cmp2, Zero);
11725 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11726 }
11727 if (ShouldInvert)
11728 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11729 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11730 return Cmp;
11731}
11732
11733SDValue AArch64TargetLowering::LowerSELECT_CC(
11736 const SDLoc &DL, SelectionDAG &DAG) const {
11737 // Handle f128 first, because it will result in a comparison of some RTLIB
11738 // call result against zero.
11739 if (LHS.getValueType() == MVT::f128) {
11740 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11741
11742 // If softenSetCCOperands returned a scalar, we need to compare the result
11743 // against zero to select between true and false values.
11744 if (!RHS.getNode()) {
11745 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11746 CC = ISD::SETNE;
11747 }
11748 }
11749
11750 // Also handle f16, for which we need to do a f32 comparison.
11751 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11752 LHS.getValueType() == MVT::bf16) {
11753 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11754 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11755 }
11756
11757 // Next, handle integers.
11758 if (LHS.getValueType().isInteger()) {
11759 assert((LHS.getValueType() == RHS.getValueType()) &&
11760 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11761
11762 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11763 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11764 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11765
11766 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11767 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11768 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11769     // Both require fewer instructions than a compare and conditional select.
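    // For example (i64 sketch): smax(x, 0) is expected to lower to
    //   bic x0, x0, x0, asr #63   // clear x when the sign bit is set
    // and smin(x, 0) to
    //   and x0, x0, x0, asr #63   // keep x only when the sign bit is set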
11770 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11771 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11772 LHS.getValueType() == RHS.getValueType()) {
11773 EVT VT = LHS.getValueType();
11774 SDValue Shift =
11775 DAG.getNode(ISD::SRA, DL, VT, LHS,
11776 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11777
11778 if (CC == ISD::SETGT)
11779 Shift = DAG.getNOT(DL, Shift, VT);
11780
11781 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11782 }
11783
11784 // Check for sign bit test patterns that can use TST optimization.
11785 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
11786 // -> TST %operand, sign_bit; CSEL
11787 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
11788 // -> TST %operand, sign_bit; CSEL
11789 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
11790 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
11791 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11792
11793 uint64_t SignBitPos;
11794 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11795 EVT TestVT = LHS.getValueType();
11796 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
11797 SDValue TST =
11798 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
11799 LHS, SignBitConst);
11800
11801 SDValue Flags = TST.getValue(1);
11802 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
11803 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
11804 }
11805
11806 // Canonicalise absolute difference patterns:
11807 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11808 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11809 //
11810 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11811 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11812 // The second forms can be matched into subs+cneg.
11813 // NOTE: Drop poison generating flags from the negated operand to avoid
11814 // inadvertently propagating poison after the canonicalisation.
11815 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11816 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11817 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11819 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11820 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11821 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11823 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11824 }
11825 }
11826
11827 unsigned Opcode = AArch64ISD::CSEL;
11828
11829     // If both the TVal and the FVal are constants, see if we can swap them in
11830     // order to form a CSINV or CSINC out of them.
11831 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11832 std::swap(TVal, FVal);
11833 std::swap(CTVal, CFVal);
11834 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11835 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11836 std::swap(TVal, FVal);
11837 std::swap(CTVal, CFVal);
11838 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11839 } else if (TVal.getOpcode() == ISD::XOR) {
11840 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11841 // with a CSINV rather than a CSEL.
11842 if (isAllOnesConstant(TVal.getOperand(1))) {
11843 std::swap(TVal, FVal);
11844 std::swap(CTVal, CFVal);
11845 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11846 }
11847 } else if (TVal.getOpcode() == ISD::SUB) {
11848 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11849 // that we can match with a CSNEG rather than a CSEL.
11850 if (isNullConstant(TVal.getOperand(0))) {
11851 std::swap(TVal, FVal);
11852 std::swap(CTVal, CFVal);
11853 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11854 }
11855 } else if (CTVal && CFVal) {
11856 const int64_t TrueVal = CTVal->getSExtValue();
11857 const int64_t FalseVal = CFVal->getSExtValue();
11858 bool Swap = false;
11859
11860 // If both TVal and FVal are constants, see if FVal is the
11861 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11862 // instead of a CSEL in that case.
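      // For example (a sketch): (TVal, FVal) = (C, ~C) can use CSINV,
      // (C, -C) can use CSNEG, and (C, C - 1) or (C - 1, C) can use CSINC,
      // possibly after swapping the operands and inverting the condition.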
11863 if (TrueVal == ~FalseVal) {
11864 Opcode = AArch64ISD::CSINV;
11865 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11866 TrueVal == -FalseVal) {
11867 Opcode = AArch64ISD::CSNEG;
11868 } else if (TVal.getValueType() == MVT::i32) {
11869 // If our operands are only 32-bit wide, make sure we use 32-bit
11870 // arithmetic for the check whether we can use CSINC. This ensures that
11871 // the addition in the check will wrap around properly in case there is
11872 // an overflow (which would not be the case if we do the check with
11873 // 64-bit arithmetic).
11874 const uint32_t TrueVal32 = CTVal->getZExtValue();
11875 const uint32_t FalseVal32 = CFVal->getZExtValue();
11876
11877 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11878 Opcode = AArch64ISD::CSINC;
11879
11880 if (TrueVal32 > FalseVal32) {
11881 Swap = true;
11882 }
11883 }
11884 } else {
11885 // 64-bit check whether we can use CSINC.
11886 const uint64_t TrueVal64 = TrueVal;
11887 const uint64_t FalseVal64 = FalseVal;
11888
11889 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11890 Opcode = AArch64ISD::CSINC;
11891
11892 if (TrueVal > FalseVal) {
11893 Swap = true;
11894 }
11895 }
11896 }
11897
11898 // Swap TVal and FVal if necessary.
11899 if (Swap) {
11900 std::swap(TVal, FVal);
11901 std::swap(CTVal, CFVal);
11902 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11903 }
11904
11905 if (Opcode != AArch64ISD::CSEL) {
11906 // Drop FVal since we can get its value by simply inverting/negating
11907 // TVal.
11908 FVal = TVal;
11909 }
11910 }
11911
11912 // Avoid materializing a constant when possible by reusing a known value in
11913 // a register. However, don't perform this optimization if the known value
11914 // is one, zero or negative one in the case of a CSEL. We can always
11915 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11916 // FVal, respectively.
11917 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11918 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11919         !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11920       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11921       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11922 // "a != C ? x : a" to avoid materializing C.
11923 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11924 TVal = LHS;
11925 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11926 FVal = LHS;
11927 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11928 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11929 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11930       // avoid materializing C.
11931       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11932       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11933 Opcode = AArch64ISD::CSINV;
11934 TVal = LHS;
11935 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11936 }
11937 }
11938
11939 SDValue CCVal;
11940 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11941 EVT VT = TVal.getValueType();
11942 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11943 }
11944
11945 // Now we know we're dealing with FP values.
11946 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11947 LHS.getValueType() == MVT::f64);
11948 assert(LHS.getValueType() == RHS.getValueType());
11949 EVT VT = TVal.getValueType();
11950
11951 // If the purpose of the comparison is to select between all ones
11952 // or all zeros, try to use a vector comparison because the operands are
11953 // already stored in SIMD registers.
11954 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11955 switch (U->getOpcode()) {
11956 default:
11957 return false;
11960 case AArch64ISD::DUP:
11961 return true;
11962 }
11963 })) {
11964 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11965 SDValue VectorCmp =
11966 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11967 if (VectorCmp)
11968 return VectorCmp;
11969 }
11970
11971 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11972
11973 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11974 // clean. Some of them require two CSELs to implement.
11975 AArch64CC::CondCode CC1, CC2;
11976 changeFPCCToAArch64CC(CC, CC1, CC2);
11977
11978 if (Flags.hasNoSignedZeros()) {
11979 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11980 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11981 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11982 if (RHSVal && RHSVal->isZero()) {
11983 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11984 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11985
11986 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11987 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11988 TVal = LHS;
11989 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11990 CFVal && CFVal->isZero() &&
11991 FVal.getValueType() == LHS.getValueType())
11992 FVal = LHS;
11993 }
11994 }
11995
11996 // Emit first, and possibly only, CSEL.
11997 SDValue CC1Val = getCondCode(DAG, CC1);
11998 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11999
12000 // If we need a second CSEL, emit it, using the output of the first as the
12001 // RHS. We're effectively OR'ing the two CC's together.
12002 if (CC2 != AArch64CC::AL) {
12003 SDValue CC2Val = getCondCode(DAG, CC2);
12004 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12005 }
12006
12007 // Otherwise, return the output of the first CSEL.
12008 return CS1;
12009}
12010
12011SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12012 SelectionDAG &DAG) const {
12013 EVT Ty = Op.getValueType();
12014 auto Idx = Op.getConstantOperandAPInt(2);
12015 int64_t IdxVal = Idx.getSExtValue();
12016 assert(Ty.isScalableVector() &&
12017 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12018
12019 // We can use the splice instruction for certain index values where we are
12020 // able to efficiently generate the correct predicate. The index will be
12021 // inverted and used directly as the input to the ptrue instruction, i.e.
12022 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12023 // splice predicate. However, we can only do this if we can guarantee that
12024 // there are enough elements in the vector, hence we check the index <= min
12025 // number of elements.
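  // Rough example: for nxv2i64 and IdxVal == -1 this is expected to become
  //   ptrue  p0.d, vl1
  //   rev    p0.d, p0.d            // only the last element is active
  //   splice z0.d, p0, z0.d, z1.d  // { z0[last], z1[0], z1[1], ... }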
12026 std::optional<unsigned> PredPattern;
12027 if (Ty.isScalableVector() && IdxVal < 0 &&
12028 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
12029 std::nullopt) {
12030 SDLoc DL(Op);
12031
12032 // Create a predicate where all but the last -IdxVal elements are false.
12033 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
12034 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12035 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12036
12037 // Now splice the two inputs together using the predicate.
12038 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12039 Op.getOperand(1));
12040 }
12041
12042 // We can select to an EXT instruction when indexing the first 256 bytes.
12044 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
12045 return Op;
12046
12047 return SDValue();
12048}
12049
12050SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12051 SelectionDAG &DAG) const {
12052 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12053 SDValue LHS = Op.getOperand(0);
12054 SDValue RHS = Op.getOperand(1);
12055 SDValue TVal = Op.getOperand(2);
12056 SDValue FVal = Op.getOperand(3);
12057 SDNodeFlags Flags = Op->getFlags();
12058 SDLoc DL(Op);
12059 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12060}
12061
12062SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12063 SelectionDAG &DAG) const {
12064 SDValue CCVal = Op->getOperand(0);
12065 SDValue TVal = Op->getOperand(1);
12066 SDValue FVal = Op->getOperand(2);
12067 SDLoc DL(Op);
12068
12069 EVT Ty = Op.getValueType();
12070 if (Ty == MVT::aarch64svcount) {
12071 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12072 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12073 SDValue Sel =
12074 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12075 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12076 }
12077
12078 if (Ty.isScalableVector()) {
12079 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12080 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12081 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12082 }
12083
12084 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12085 // FIXME: Ideally this would be the same as above using i1 types, however
12086 // for the moment we can't deal with fixed i1 vector types properly, so
12087 // instead extend the predicate to a result type sized integer vector.
12088 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12089 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12090 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12091 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12092 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12093 }
12094
12095 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12096 // instruction.
12097 if (ISD::isOverflowIntrOpRes(CCVal)) {
12098 // Only lower legal XALUO ops.
12099 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12100 return SDValue();
12101
12102     AArch64CC::CondCode OFCC;
12103     SDValue Value, Overflow;
12104 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12105 SDValue CCVal = getCondCode(DAG, OFCC);
12106
12107 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12108 CCVal, Overflow);
12109 }
12110
12111 // Lower it the same way as we would lower a SELECT_CC node.
12112 ISD::CondCode CC;
12113 SDValue LHS, RHS;
12114 if (CCVal.getOpcode() == ISD::SETCC) {
12115 LHS = CCVal.getOperand(0);
12116 RHS = CCVal.getOperand(1);
12117 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12118 } else {
12119 LHS = CCVal;
12120 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12121 CC = ISD::SETNE;
12122 }
12123
12124   // If we are lowering an f16 and do not have full fp16 support, convert it
12125   // to an f32 in order to use FCSELSrrr.
12126 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12127 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12128 DAG.getUNDEF(MVT::f32), TVal);
12129 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12130 DAG.getUNDEF(MVT::f32), FVal);
12131 }
12132
12133 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12134 Op->getFlags(), DL, DAG);
12135
12136 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12137 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12138 }
12139
12140 return Res;
12141}
12142
12143SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12144 SelectionDAG &DAG) const {
12145 // Jump table entries as PC relative offsets. No additional tweaking
12146 // is necessary here. Just get the address of the jump table.
12147 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12148
12151 !Subtarget->isTargetMachO())
12152 return getAddrLarge(JT, DAG);
12153 if (CM == CodeModel::Tiny)
12154 return getAddrTiny(JT, DAG);
12155 return getAddr(JT, DAG);
12156}
12157
12158SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12159 SelectionDAG &DAG) const {
12160 // Jump table entries as PC relative offsets. No additional tweaking
12161 // is necessary here. Just get the address of the jump table.
12162 SDLoc DL(Op);
12163 SDValue JT = Op.getOperand(1);
12164 SDValue Entry = Op.getOperand(2);
12165 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12166
12167 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12168 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12169
12170 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12171 // sequence later, to guarantee the integrity of the intermediate values.
12173           "aarch64-jump-table-hardening")) {
12174     CodeModel::Model CM = getTargetMachine().getCodeModel();
12175     if (Subtarget->isTargetMachO()) {
12176 if (CM != CodeModel::Small && CM != CodeModel::Large)
12177 report_fatal_error("Unsupported code-model for hardened jump-table");
12178 } else {
12179 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12180 assert(Subtarget->isTargetELF() &&
12181 "jump table hardening only supported on MachO/ELF");
12182 if (CM != CodeModel::Small)
12183 report_fatal_error("Unsupported code-model for hardened jump-table");
12184 }
12185
12186 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12187 Entry, SDValue());
12188 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12189 DAG.getTargetJumpTable(JTI, MVT::i32),
12190 X16Copy.getValue(0), X16Copy.getValue(1));
12191 return SDValue(B, 0);
12192 }
12193
12194 SDNode *Dest =
12195 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12196 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12197 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12198 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12199}
12200
12201SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12202 SDValue Chain = Op.getOperand(0);
12203 SDValue Dest = Op.getOperand(1);
12204
12205   // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
12206 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12207 if (Dest->isMachineOpcode() &&
12208 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12209 return SDValue();
12210
12211 const MachineFunction &MF = DAG.getMachineFunction();
12212 std::optional<uint16_t> BADisc =
12213 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12214 if (!BADisc)
12215 return SDValue();
12216
12217 SDLoc DL(Op);
12218
12219 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12221 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12222
12223 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12224 {Dest, Key, Disc, AddrDisc, Chain});
12225 return SDValue(BrA, 0);
12226}
12227
12228SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12229 SelectionDAG &DAG) const {
12230   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12231   CodeModel::Model CM = getTargetMachine().getCodeModel();
12232   if (CM == CodeModel::Large) {
12233 // Use the GOT for the large code model on iOS.
12234 if (Subtarget->isTargetMachO()) {
12235 return getGOT(CP, DAG);
12236 }
12238 return getAddrLarge(CP, DAG);
12239 } else if (CM == CodeModel::Tiny) {
12240 return getAddrTiny(CP, DAG);
12241 }
12242 return getAddr(CP, DAG);
12243}
12244
12245SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12246 SelectionDAG &DAG) const {
12247 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12248 const BlockAddress *BA = BAN->getBlockAddress();
12249
12250 if (std::optional<uint16_t> BADisc =
12251 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12252 *BA->getFunction())) {
12253 SDLoc DL(Op);
12254
12255 // This isn't cheap, but BRIND is rare.
12256 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12257
12258 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12259
12261 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12262
12263 SDNode *MOV =
12264 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12265 {TargetBA, Key, AddrDisc, Disc});
12266 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12267 SDValue(MOV, 1));
12268 }
12269
12270   CodeModel::Model CM = getTargetMachine().getCodeModel();
12271   if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12273 return getAddrLarge(BAN, DAG);
12274 } else if (CM == CodeModel::Tiny) {
12275 return getAddrTiny(BAN, DAG);
12276 }
12277 return getAddr(BAN, DAG);
12278}
12279
12280SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12281 SelectionDAG &DAG) const {
12282 AArch64FunctionInfo *FuncInfo =
12283 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12284
12285 SDLoc DL(Op);
12286   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12287                                  getPointerTy(DAG.getDataLayout()));
12288   FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12289 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12290 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12291 MachinePointerInfo(SV));
12292}
12293
12294SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12295 SelectionDAG &DAG) const {
12296 MachineFunction &MF = DAG.getMachineFunction();
12297 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12298
12299 SDLoc DL(Op);
12300 SDValue FR;
12301 if (Subtarget->isWindowsArm64EC()) {
12302 // With the Arm64EC ABI, we compute the address of the varargs save area
12303 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12304 // but calls from an entry thunk can pass in a different address.
12305 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12306 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12307 uint64_t StackOffset;
12308 if (FuncInfo->getVarArgsGPRSize() > 0)
12309 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12310 else
12311 StackOffset = FuncInfo->getVarArgsStackOffset();
12312 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12313 DAG.getConstant(StackOffset, DL, MVT::i64));
12314 } else {
12315 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12316 ? FuncInfo->getVarArgsGPRIndex()
12317                                : FuncInfo->getVarArgsStackIndex(),
12318                            getPointerTy(DAG.getDataLayout()));
12319   }
12320 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12321 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12322 MachinePointerInfo(SV));
12323}
12324
12325SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12326 SelectionDAG &DAG) const {
12327 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12328 // Standard, section B.3.
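  // For reference, the structure being initialized corresponds roughly to:
  //   struct va_list {
  //     void *__stack;    // offset 0
  //     void *__gr_top;   // offset 8  (4 on ILP32)
  //     void *__vr_top;   // offset 16 (8 on ILP32)
  //     int   __gr_offs;  // offset 24 (12 on ILP32)
  //     int   __vr_offs;  // offset 28 (16 on ILP32)
  //   };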
12329 MachineFunction &MF = DAG.getMachineFunction();
12330 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12331 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12332 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12333 auto PtrVT = getPointerTy(DAG.getDataLayout());
12334 SDLoc DL(Op);
12335
12336 SDValue Chain = Op.getOperand(0);
12337 SDValue VAList = Op.getOperand(1);
12338   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12339   SmallVector<SDValue, 4> MemOps;
12340
12341 // void *__stack at offset 0
12342 unsigned Offset = 0;
12343 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12344 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12345 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12346 MachinePointerInfo(SV), Align(PtrSize)));
12347
12348 // void *__gr_top at offset 8 (4 on ILP32)
12349 Offset += PtrSize;
12350 int GPRSize = FuncInfo->getVarArgsGPRSize();
12351 if (GPRSize > 0) {
12352 SDValue GRTop, GRTopAddr;
12353
12354 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12355 DAG.getConstant(Offset, DL, PtrVT));
12356
12357 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12358 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12359 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12360 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12361
12362 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12363 MachinePointerInfo(SV, Offset),
12364 Align(PtrSize)));
12365 }
12366
12367 // void *__vr_top at offset 16 (8 on ILP32)
12368 Offset += PtrSize;
12369 int FPRSize = FuncInfo->getVarArgsFPRSize();
12370 if (FPRSize > 0) {
12371 SDValue VRTop, VRTopAddr;
12372 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12373 DAG.getConstant(Offset, DL, PtrVT));
12374
12375 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12376 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12377 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12378 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12379
12380 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12381 MachinePointerInfo(SV, Offset),
12382 Align(PtrSize)));
12383 }
12384
12385 // int __gr_offs at offset 24 (12 on ILP32)
12386 Offset += PtrSize;
12387 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12388 DAG.getConstant(Offset, DL, PtrVT));
12389 MemOps.push_back(
12390 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12391 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12392
12393 // int __vr_offs at offset 28 (16 on ILP32)
12394 Offset += 4;
12395 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12396 DAG.getConstant(Offset, DL, PtrVT));
12397 MemOps.push_back(
12398 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12399 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12400
12401 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12402}
12403
12404SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12405 SelectionDAG &DAG) const {
12406 MachineFunction &MF = DAG.getMachineFunction();
12407 Function &F = MF.getFunction();
12408
12409 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12410 return LowerWin64_VASTART(Op, DAG);
12411 else if (Subtarget->isTargetDarwin())
12412 return LowerDarwin_VASTART(Op, DAG);
12413 else
12414 return LowerAAPCS_VASTART(Op, DAG);
12415}
12416
12417SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12418 SelectionDAG &DAG) const {
12419   // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
12420 // pointer.
12421 SDLoc DL(Op);
12422 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12423 unsigned VaListSize =
12424 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12425 ? PtrSize
12426 : Subtarget->isTargetILP32() ? 20 : 32;
12427 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12428 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12429
12430 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12431 DAG.getConstant(VaListSize, DL, MVT::i32),
12432 Align(PtrSize), false, false, /*CI=*/nullptr,
12433 std::nullopt, MachinePointerInfo(DestSV),
12434 MachinePointerInfo(SrcSV));
12435}
12436
12437SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12438 assert(Subtarget->isTargetDarwin() &&
12439 "automatic va_arg instruction only works on Darwin");
12440
12441 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12442 EVT VT = Op.getValueType();
12443 SDLoc DL(Op);
12444 SDValue Chain = Op.getOperand(0);
12445 SDValue Addr = Op.getOperand(1);
12446 MaybeAlign Align(Op.getConstantOperandVal(3));
12447 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12448 auto PtrVT = getPointerTy(DAG.getDataLayout());
12449 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12450 SDValue VAList =
12451 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12452 Chain = VAList.getValue(1);
12453 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12454
12455 if (VT.isScalableVector())
12456 report_fatal_error("Passing SVE types to variadic functions is "
12457 "currently not supported");
12458
12459 if (Align && *Align > MinSlotSize) {
12460 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12461 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12462 VAList =
12463 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12464 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12465 }
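  // Worked example of the round-up above: with a 16-byte alignment and
  // VAList == 0x1008, (0x1008 + 15) & -16 == 0x1010.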
12466
12467 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12468 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12469
12470 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12471 // up to 64 bits. At the very least, we have to increase the striding of the
12472 // vaargs list to match this, and for FP values we need to introduce
12473 // FP_ROUND nodes as well.
12474 if (VT.isInteger() && !VT.isVector())
12475 ArgSize = std::max(ArgSize, MinSlotSize);
12476 bool NeedFPTrunc = false;
12477 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12478 ArgSize = 8;
12479 NeedFPTrunc = true;
12480 }
12481
12482 // Increment the pointer, VAList, to the next vaarg
12483 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12484 DAG.getConstant(ArgSize, DL, PtrVT));
12485 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12486
12487 // Store the incremented VAList to the legalized pointer
12488 SDValue APStore =
12489 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12490
12491 // Load the actual argument out of the pointer VAList
12492 if (NeedFPTrunc) {
12493 // Load the value as an f64.
12494 SDValue WideFP =
12495 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12496 // Round the value down to an f32.
12497 SDValue NarrowFP =
12498 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12499 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12500 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12501 // Merge the rounded value with the chain output of the load.
12502 return DAG.getMergeValues(Ops, DL);
12503 }
12504
12505 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12506}
12507
12508SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12509 SelectionDAG &DAG) const {
12510 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12511 MFI.setFrameAddressIsTaken(true);
12512
12513 EVT VT = Op.getValueType();
12514 SDLoc DL(Op);
12515 unsigned Depth = Op.getConstantOperandVal(0);
12516 SDValue FrameAddr =
12517 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12518 while (Depth--)
12519 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12520 MachinePointerInfo());
12521
12522 if (Subtarget->isTargetILP32())
12523 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12524 DAG.getValueType(VT));
12525
12526 return FrameAddr;
12527}
12528
12529SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12530 SelectionDAG &DAG) const {
12531 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12532
12533 EVT VT = getPointerTy(DAG.getDataLayout());
12534 int FI = MFI.CreateFixedObject(4, 0, false);
12535 return DAG.getFrameIndex(FI, VT);
12536}
12537
12538#define GET_REGISTER_MATCHER
12539#include "AArch64GenAsmMatcher.inc"
12540
12541// FIXME? Maybe this could be a TableGen attribute on some registers and
12542// this table could be generated automatically from RegInfo.
12543Register AArch64TargetLowering::
12544 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12545   Register Reg = MatchRegisterName(RegName);
12546   if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12547 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12548 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12549 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12550 !MRI->isReservedReg(MF, Reg))
12551 Reg = Register();
12552 }
12553 return Reg;
12554}
12555
12556SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12557 SelectionDAG &DAG) const {
12559
12560 EVT VT = Op.getValueType();
12561 SDLoc DL(Op);
12562
12563 SDValue FrameAddr =
12564 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12566
12567 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12568}
12569
12570SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12571 SelectionDAG &DAG) const {
12572 MachineFunction &MF = DAG.getMachineFunction();
12573 MachineFrameInfo &MFI = MF.getFrameInfo();
12574 MFI.setReturnAddressIsTaken(true);
12575
12576 EVT VT = Op.getValueType();
12577 SDLoc DL(Op);
12578 unsigned Depth = Op.getConstantOperandVal(0);
12579 SDValue ReturnAddress;
12580 if (Depth) {
12581 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12583 ReturnAddress = DAG.getLoad(
12584 VT, DL, DAG.getEntryNode(),
12585 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12586 } else {
12587 // Return LR, which contains the return address. Mark it an implicit
12588 // live-in.
12589 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12590 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12591 }
12592
12593   // The XPACLRI instruction assembles to a hint-space instruction before
12594   // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
12595   // architecture. On Armv8.3-A and onwards XPACI is available, so use
12596   // that instead.
12597 SDNode *St;
12598 if (Subtarget->hasPAuth()) {
12599 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12600 } else {
12601 // XPACLRI operates on LR therefore we must move the operand accordingly.
12602 SDValue Chain =
12603 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12604 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12605 }
12606 return SDValue(St, 0);
12607}
12608
12609 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12610 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
12611SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12612 SelectionDAG &DAG) const {
12613 SDValue Lo, Hi;
12614 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12615 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12616}
12617
12618 bool AArch64TargetLowering::isOffsetFoldingLegal(
12619     const GlobalAddressSDNode *GA) const {
12620 // Offsets are folded in the DAG combine rather than here so that we can
12621 // intelligently choose an offset based on the uses.
12622 return false;
12623}
12624
12625 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12626 bool OptForSize) const {
12627 bool IsLegal = false;
12628 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12629 // and for the 16-bit case when the target has full fp16 support.
12630 // We encode bf16 bit patterns as if they were fp16. This results in very
12631 // strange looking assembly but should populate the register with appropriate
12632 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12633 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12634 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12635 // FIXME: We should be able to handle f128 as well with a clever lowering.
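 // For reference, a worked instance of the bf16 trick described above: BF16 1.5
 // has the bit pattern 0x3FC0 (sign 0, exponent 0x7F, fraction 0x40). Read as an
 // IEEE half, 0x3FC0 is exponent 0xF (unbiased 0) with fraction 0x3C0, i.e.
 // 1.9375, which is exactly what the FMOV imm8 encoding 0x7f expands to. Asking
 // whether FP16 1.9375 is a legal fmov immediate therefore answers the same
 // question for BF16 1.5.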
12636 const APInt ImmInt = Imm.bitcastToAPInt();
12637 if (VT == MVT::f64)
12638 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12639 else if (VT == MVT::f32)
12640 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12641 else if (VT == MVT::f16 || VT == MVT::bf16)
12642 IsLegal =
12643 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12644 Imm.isPosZero();
12645
12646 // If we cannot materialize in the immediate field for fmov, check if the
12647 // value can be encoded as the immediate operand of a logical instruction.
12648 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12649 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12650 // generate that fmov.
12651 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12652 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12653 // however the mov+fmov sequence is always better because of the reduced
12654 // cache pressure. The timings are still the same if you consider
12655 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12656 // movw+movk is fused). So we limit up to 2 instructions at most.
12657 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12658 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12659 assert(Insn.size() <= 4 &&
12660 "Should be able to build any value with at most 4 moves");
12661 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12662 IsLegal = Insn.size() <= Limit;
12663 }
12664
12665 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12666 << " imm value: "; Imm.dump(););
12667 return IsLegal;
12668}
12669
12670//===----------------------------------------------------------------------===//
12671// AArch64 Optimization Hooks
12672//===----------------------------------------------------------------------===//
12673
12674static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12675 SDValue Operand, SelectionDAG &DAG,
12676 int &ExtraSteps) {
12677 EVT VT = Operand.getValueType();
12678 if ((ST->hasNEON() &&
12679 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12680 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12681 VT == MVT::v4f32)) ||
12682 (ST->hasSVE() &&
12683 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12684 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12685 // For the reciprocal estimates, convergence is quadratic, so the number
12686 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12687 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12688 // the result for float (23 mantissa bits) is 2 and for double (52
12689 // mantissa bits) is 3.
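 // Worked out with these numbers: float has a 24-bit significand, so
 // ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 extra steps; double has 53 bits,
 // so 6 - 3 = 3; fp16 has 11 bits, so 4 - 3 = 1.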
12690 constexpr unsigned AccurateBits = 8;
12691 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12692 ExtraSteps = DesiredBits <= AccurateBits
12693 ? 0
12694 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12695 }
12696
12697 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12698 }
12699
12700 return SDValue();
12701}
12702
12703SDValue
12704AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12705 const DenormalMode &Mode) const {
12706 SDLoc DL(Op);
12707 EVT VT = Op.getValueType();
12708 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12709 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12710 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12711}
12712
12713SDValue
12714AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12715 SelectionDAG &DAG) const {
12716 return Op;
12717}
12718
12719SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12720 SelectionDAG &DAG, int Enabled,
12721 int &ExtraSteps,
12722 bool &UseOneConst,
12723 bool Reciprocal) const {
12724 if (Enabled == ReciprocalEstimate::Enabled ||
12725 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12726 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12727 DAG, ExtraSteps)) {
12728 SDLoc DL(Operand);
12729 EVT VT = Operand.getValueType();
12730
12731 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12732 SDNodeFlags Flags =
12734
12735 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12736 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
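 // Each trip through the loop below performs one such step:
 //   Step = E * E                   (FMUL)
 //   Step = 0.5 * (3 - X * Step)    (FRSQRTS X, Step)
 //   E    = E * Step                (FMUL)
 // i.e. E <- E * 0.5 * (3 - X * E^2), matching the formula above.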
12737 for (int i = ExtraSteps; i > 0; --i) {
12738 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12739 Flags);
12740 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12741 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12742 }
12743 if (!Reciprocal)
12744 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12745
12746 ExtraSteps = 0;
12747 return Estimate;
12748 }
12749
12750 return SDValue();
12751}
12752
12753SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12754 SelectionDAG &DAG, int Enabled,
12755 int &ExtraSteps) const {
12756 if (Enabled == ReciprocalEstimate::Enabled)
12757 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12758 DAG, ExtraSteps)) {
12759 SDLoc DL(Operand);
12760 EVT VT = Operand.getValueType();
12761
12763
12764 // Newton reciprocal iteration: E * (2 - X * E)
12765 // AArch64 reciprocal iteration instruction: (2 - M * N)
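 // Each trip through the loop below performs one such step:
 //   Step = 2 - X * E               (FRECPS X, E)
 //   E    = E * Step                (FMUL)
 // i.e. E <- E * (2 - X * E), matching the formula above.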
12766 for (int i = ExtraSteps; i > 0; --i) {
12767 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12768 Estimate, Flags);
12769 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12770 }
12771
12772 ExtraSteps = 0;
12773 return Estimate;
12774 }
12775
12776 return SDValue();
12777}
12778
12779//===----------------------------------------------------------------------===//
12780// AArch64 Inline Assembly Support
12781//===----------------------------------------------------------------------===//
12782
12783// Table of Constraints
12784// TODO: This is the current set of constraints supported by ARM for the
12785// compiler, not all of them may make sense.
12786//
12787// r - A general register
12788// w - An FP/SIMD register of some size in the range v0-v31
12789// x - An FP/SIMD register of some size in the range v0-v15
12790// I - Constant that can be used with an ADD instruction
12791// J - Constant that can be used with a SUB instruction
12792// K - Constant that can be used with a 32-bit logical instruction
12793// L - Constant that can be used with a 64-bit logical instruction
12794// M - Constant that can be used as a 32-bit MOV immediate
12795// N - Constant that can be used as a 64-bit MOV immediate
12796// Q - A memory reference with base register and no offset
12797// S - A symbolic address
12798// Y - Floating point constant zero
12799// Z - Integer constant zero
12800//
12801// Note that general register operands will be output using their 64-bit x
12802// register name, whatever the size of the variable, unless the asm operand
12803// is prefixed by the %w modifier. Floating-point and SIMD register operands
12804// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12805// %q modifier.
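 // For illustration, a hypothetical use of two of these constraints from C
 // (variable names are made up) might look like:
 //   int res;
 //   asm("add %w0, %w1, %2" : "=r"(res) : "r"(a), "I"(4095));
 // where 4095 is accepted because it is a valid ADD immediate, and the register
 // operands are printed with their w names because of the %w modifier.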
12806const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12807 // At this point, we have to lower this constraint to something else, so we
12808 // lower it to an "r" or "w". However, by doing this we will force the result
12809 // to be in register, while the X constraint is much more permissive.
12810 //
12811 // Although we are correct (we are free to emit anything, without
12812 // constraints), we might break use cases that would expect us to be more
12813 // efficient and emit something else.
12814 if (!Subtarget->hasFPARMv8())
12815 return "r";
12816
12817 if (ConstraintVT.isFloatingPoint())
12818 return "w";
12819
12820 if (ConstraintVT.isVector() &&
12821 (ConstraintVT.getSizeInBits() == 64 ||
12822 ConstraintVT.getSizeInBits() == 128))
12823 return "w";
12824
12825 return "r";
12826}
12827
12828 enum class PredicateConstraint { Uph, Upl, Upa };
12829
12830// Returns a {Reg, RegisterClass} tuple if the constraint is
12831// a specific predicate register.
12832//
12833// For some constraint like "{pn3}" the default path in
12834// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12835// suitable register class for this register is "PPRorPNR", after which it
12836// determines that nxv16i1 is an appropriate type for the constraint, which is
12837// not what we want. The code here pre-empts this by matching the register
12838// explicitly.
12839static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12840 parseSVERegAsConstraint(StringRef Constraint) {
12841 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12842 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12843 return std::nullopt;
12844
12845 bool IsPredicate = Constraint[1] == 'p';
12846 Constraint = Constraint.substr(2, Constraint.size() - 3);
12847 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12848 if (IsPredicateAsCount)
12849 Constraint = Constraint.drop_front(1);
12850
12851 unsigned V;
12852 if (Constraint.getAsInteger(10, V) || V > 31)
12853 return std::nullopt;
12854
12855 if (IsPredicateAsCount)
12856 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12857 if (IsPredicate)
12858 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12859 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12860}
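 // Examples of what the matcher above accepts:
 //   "{p5}"  -> {AArch64::P5,  &AArch64::PPRRegClass}  (SVE predicate)
 //   "{pn3}" -> {AArch64::PN3, &AArch64::PNRRegClass}  (predicate-as-counter)
 //   "{z12}" -> {AArch64::Z12, &AArch64::ZPRRegClass}  (SVE data register)
 // Anything else, e.g. "{q0}" or "{p32}", returns std::nullopt and is handled
 // by the generic constraint lowering instead.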
12861
12862 static std::optional<PredicateConstraint>
12863 parsePredicateConstraint(StringRef Constraint) {
12864 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12865 .Case("Uph", PredicateConstraint::Uph)
12866 .Case("Upl", PredicateConstraint::Upl)
12867 .Case("Upa", PredicateConstraint::Upa)
12868 .Default(std::nullopt);
12869}
12870
12871static const TargetRegisterClass *
12872 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12873 if (VT != MVT::aarch64svcount &&
12874 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12875 return nullptr;
12876
12877 switch (Constraint) {
12878 case PredicateConstraint::Uph:
12879 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12880 : &AArch64::PPR_p8to15RegClass;
12881 case PredicateConstraint::Upl:
12882 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12883 : &AArch64::PPR_3bRegClass;
12884 case PredicateConstraint::Upa:
12885 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12886 : &AArch64::PPRRegClass;
12887 }
12888
12889 llvm_unreachable("Missing PredicateConstraint!");
12890}
12891
12892 enum class ReducedGprConstraint { Uci, Ucj };
12893
12894 static std::optional<ReducedGprConstraint>
12895 parseReducedGprConstraint(StringRef Constraint) {
12896 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12897 .Case("Uci", ReducedGprConstraint::Uci)
12898 .Case("Ucj", ReducedGprConstraint::Ucj)
12899 .Default(std::nullopt);
12900}
12901
12902static const TargetRegisterClass *
12903 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12904 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12905 return nullptr;
12906
12907 switch (Constraint) {
12908 case ReducedGprConstraint::Uci:
12909 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12910 case ReducedGprConstraint::Ucj:
12911 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12912 }
12913
12914 llvm_unreachable("Missing ReducedGprConstraint!");
12915}
12916
12917// The set of cc code supported is from
12918// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12919 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12920 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12921 .Case("{@cchi}", AArch64CC::HI)
12922 .Case("{@cccs}", AArch64CC::HS)
12923 .Case("{@cclo}", AArch64CC::LO)
12924 .Case("{@ccls}", AArch64CC::LS)
12925 .Case("{@cccc}", AArch64CC::LO)
12926 .Case("{@cceq}", AArch64CC::EQ)
12927 .Case("{@ccgt}", AArch64CC::GT)
12928 .Case("{@ccge}", AArch64CC::GE)
12929 .Case("{@cclt}", AArch64CC::LT)
12930 .Case("{@ccle}", AArch64CC::LE)
12931 .Case("{@cchs}", AArch64CC::HS)
12932 .Case("{@ccne}", AArch64CC::NE)
12933 .Case("{@ccvc}", AArch64CC::VC)
12934 .Case("{@ccpl}", AArch64CC::PL)
12935 .Case("{@ccvs}", AArch64CC::VS)
12936 .Case("{@ccmi}", AArch64CC::MI)
12937 .Default(AArch64CC::Invalid);
12938 return Cond;
12939}
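 // For example, a flag-output operand written as "=@cchi" in C inline asm shows
 // up here as the constraint string "{@cchi}" and maps to AArch64CC::HI;
 // LowerAsmOutputForConstraint below then reads NZCV and materializes the bit
 // with a CSINC. Note that "cc"/"lo" and "cs"/"hs" are aliases for the same
 // condition codes.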
12940
12941/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12942/// WZR, invert(<cond>)'.
12943 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12944 SelectionDAG &DAG) {
12945 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12946 DAG.getConstant(0, DL, MVT::i32),
12947 DAG.getConstant(0, DL, MVT::i32),
12948 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12949}
12950
12951// Lower @cc flag output via getSETCC.
12952SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12953 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12954 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12955 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12956 if (Cond == AArch64CC::Invalid)
12957 return SDValue();
12958 // The output variable should be a scalar integer.
12959 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12960 OpInfo.ConstraintVT.getSizeInBits() < 8)
12961 report_fatal_error("Flag output operand is of invalid type");
12962
12963 // Get NZCV register. Only update chain when copyfrom is glued.
12964 if (Glue.getNode()) {
12965 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12966 Chain = Glue.getValue(1);
12967 } else
12968 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12969 // Extract CC code.
12970 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12971
12972 SDValue Result;
12973
12974 // Truncate or ZERO_EXTEND based on value types.
12975 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12976 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12977 else
12978 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12979
12980 return Result;
12981}
12982
12983/// getConstraintType - Given a constraint letter, return the type of
12984/// constraint it is for this target.
12985 AArch64TargetLowering::ConstraintType
12986 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12987 if (Constraint.size() == 1) {
12988 switch (Constraint[0]) {
12989 default:
12990 break;
12991 case 'x':
12992 case 'w':
12993 case 'y':
12994 return C_RegisterClass;
12995 // An address with a single base register. Due to the way we
12996 // currently handle addresses it is the same as 'r'.
12997 case 'Q':
12998 return C_Memory;
12999 case 'I':
13000 case 'J':
13001 case 'K':
13002 case 'L':
13003 case 'M':
13004 case 'N':
13005 case 'Y':
13006 case 'Z':
13007 return C_Immediate;
13008 case 'z':
13009 case 'S': // A symbol or label reference with a constant offset
13010 return C_Other;
13011 }
13012 } else if (parsePredicateConstraint(Constraint))
13013 return C_RegisterClass;
13014 else if (parseReducedGprConstraint(Constraint))
13015 return C_RegisterClass;
13016 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13017 return C_Other;
13018 return TargetLowering::getConstraintType(Constraint);
13019}
13020
13021/// Examine constraint type and operand type and determine a weight value.
13022/// This object must already have been set up with the operand type
13023/// and the current alternative constraint selected.
13024 TargetLowering::ConstraintWeight
13025 AArch64TargetLowering::getSingleConstraintMatchWeight(
13026 AsmOperandInfo &info, const char *constraint) const {
13027 ConstraintWeight weight = CW_Invalid;
13028 Value *CallOperandVal = info.CallOperandVal;
13029 // If we don't have a value, we can't do a match,
13030 // but allow it at the lowest weight.
13031 if (!CallOperandVal)
13032 return CW_Default;
13033 Type *type = CallOperandVal->getType();
13034 // Look at the constraint type.
13035 switch (*constraint) {
13036 default:
13037 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13038 break;
13039 case 'x':
13040 case 'w':
13041 case 'y':
13042 if (type->isFloatingPointTy() || type->isVectorTy())
13043 weight = CW_Register;
13044 break;
13045 case 'z':
13046 weight = CW_Constant;
13047 break;
13048 case 'U':
13049 if (parsePredicateConstraint(constraint) ||
13050 parseReducedGprConstraint(constraint))
13051 weight = CW_Register;
13052 break;
13053 }
13054 return weight;
13055}
13056
13057std::pair<unsigned, const TargetRegisterClass *>
13058AArch64TargetLowering::getRegForInlineAsmConstraint(
13059 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13060 if (Constraint.size() == 1) {
13061 switch (Constraint[0]) {
13062 case 'r':
13063 if (VT.isScalableVector())
13064 return std::make_pair(0U, nullptr);
13065 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13066 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13067 if (VT.getFixedSizeInBits() == 64)
13068 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13069 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13070 case 'w': {
13071 if (!Subtarget->hasFPARMv8())
13072 break;
13073 if (VT.isScalableVector()) {
13074 if (VT.getVectorElementType() != MVT::i1)
13075 return std::make_pair(0U, &AArch64::ZPRRegClass);
13076 return std::make_pair(0U, nullptr);
13077 }
13078 if (VT == MVT::Other)
13079 break;
13080 uint64_t VTSize = VT.getFixedSizeInBits();
13081 if (VTSize == 16)
13082 return std::make_pair(0U, &AArch64::FPR16RegClass);
13083 if (VTSize == 32)
13084 return std::make_pair(0U, &AArch64::FPR32RegClass);
13085 if (VTSize == 64)
13086 return std::make_pair(0U, &AArch64::FPR64RegClass);
13087 if (VTSize == 128)
13088 return std::make_pair(0U, &AArch64::FPR128RegClass);
13089 break;
13090 }
13091 // The instructions that this constraint is designed for can
13092 // only take 128-bit registers so just use that regclass.
13093 case 'x':
13094 if (!Subtarget->hasFPARMv8())
13095 break;
13096 if (VT.isScalableVector())
13097 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13098 if (VT.getSizeInBits() == 128)
13099 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13100 break;
13101 case 'y':
13102 if (!Subtarget->hasFPARMv8())
13103 break;
13104 if (VT.isScalableVector())
13105 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13106 break;
13107 }
13108 } else {
13109 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13110 // SME functions that are not in streaming mode should
13111 // still observe clobbers of Z-registers by clobbering
13112 // the lower 128 bits of those registers.
13113 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13114 !Subtarget->isSVEorStreamingSVEAvailable())
13115 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13116 &AArch64::FPR128RegClass);
13117 return *P;
13118 }
13119 if (const auto PC = parsePredicateConstraint(Constraint))
13120 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13121 return std::make_pair(0U, RegClass);
13122
13123 if (const auto RGC = parseReducedGprConstraint(Constraint))
13124 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13125 return std::make_pair(0U, RegClass);
13126 }
13127 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13128 parseConstraintCode(Constraint) != AArch64CC::Invalid)
13129 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13130
13131 if (Constraint == "{za}") {
13132 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13133 }
13134
13135 if (Constraint == "{zt0}") {
13136 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13137 }
13138
13139 // Use the default implementation in TargetLowering to convert the register
13140 // constraint into a member of a register class.
13141 std::pair<unsigned, const TargetRegisterClass *> Res;
13142 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13143
13144 // Not found as a standard register?
13145 if (!Res.second) {
13146 unsigned Size = Constraint.size();
13147 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13148 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13149 int RegNo;
13150 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13151 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13152 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13153 // By default we'll emit v0-v31 for this unless there's a modifier where
13154 // we'll emit the correct register as well.
13155 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13156 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13157 Res.second = &AArch64::FPR64RegClass;
13158 } else {
13159 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13160 Res.second = &AArch64::FPR128RegClass;
13161 }
13162 }
13163 }
13164 }
13165
13166 if (Res.second && !Subtarget->hasFPARMv8() &&
13167 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13168 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13169 return std::make_pair(0U, nullptr);
13170
13171 return Res;
13172}
13173
13174 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13175 llvm::Type *Ty,
13176 bool AllowUnknown) const {
13177 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13178 return EVT(MVT::i64x8);
13179
13180 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13181}
13182
13183/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13184/// vector. If it is invalid, don't add anything to Ops.
13185void AArch64TargetLowering::LowerAsmOperandForConstraint(
13186 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13187 SelectionDAG &DAG) const {
13188 SDValue Result;
13189
13190 // Currently only support length 1 constraints.
13191 if (Constraint.size() != 1)
13192 return;
13193
13194 char ConstraintLetter = Constraint[0];
13195 switch (ConstraintLetter) {
13196 default:
13197 break;
13198
13199 // This set of constraints deals with valid constants for various instructions.
13200 // Validate and return a target constant for them if we can.
13201 case 'z': {
13202 // 'z' maps to xzr or wzr so it needs an input of 0.
13203 if (!isNullConstant(Op))
13204 return;
13205
13206 if (Op.getValueType() == MVT::i64)
13207 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13208 else
13209 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13210 break;
13211 }
13212 case 'S':
13213 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13214 // supported for PIC while "s" isn't, making "s" less useful. We implement
13215 // "S" but not "s".
13217 break;
13218
13219 case 'I':
13220 case 'J':
13221 case 'K':
13222 case 'L':
13223 case 'M':
13224 case 'N':
13225 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13226 if (!C)
13227 return;
13228
13229 // Grab the value and do some validation.
13230 uint64_t CVal = C->getZExtValue();
13231 switch (ConstraintLetter) {
13232 // The I constraint applies only to simple ADD or SUB immediate operands:
13233 // i.e. 0 to 4095 with optional shift by 12
13234 // The J constraint applies only to ADD or SUB immediates that would be
13235 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13236 // instruction [or vice versa], in other words -1 to -4095 with optional
13237 // left shift by 12.
13238 case 'I':
13239 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13240 break;
13241 return;
13242 case 'J': {
13243 uint64_t NVal = -C->getSExtValue();
13244 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13245 CVal = C->getSExtValue();
13246 break;
13247 }
13248 return;
13249 }
13250 // The K and L constraints apply *only* to logical immediates, including
13251 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13252 // been removed and MOV should be used). So these constraints have to
13253 // distinguish between bit patterns that are valid 32-bit or 64-bit
13254 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13255 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13256 // versa.
13257 case 'K':
13258 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13259 break;
13260 return;
13261 case 'L':
13262 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13263 break;
13264 return;
13265 // The M and N constraints are a superset of K and L respectively, for use
13266 // with the MOV (immediate) alias. As well as the logical immediates they
13267 // also match 32 or 64-bit immediates that can be loaded either using a
13268 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13269 // (M) or 64-bit 0x1234000000000000 (N) etc.
13270 // As a note some of this code is liberally stolen from the asm parser.
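 // For example, 32-bit 0x12340000 satisfies 'M' via "movz w0, #0x1234, lsl #16",
 // 0xffffedca satisfies 'M' via "movn w0, #0x1235", and 64-bit
 // 0x1234000000000000 satisfies 'N' via "movz x0, #0x1234, lsl #48".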
13271 case 'M': {
13272 if (!isUInt<32>(CVal))
13273 return;
13274 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13275 break;
13276 if ((CVal & 0xFFFF) == CVal)
13277 break;
13278 if ((CVal & 0xFFFF0000ULL) == CVal)
13279 break;
13280 uint64_t NCVal = ~(uint32_t)CVal;
13281 if ((NCVal & 0xFFFFULL) == NCVal)
13282 break;
13283 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13284 break;
13285 return;
13286 }
13287 case 'N': {
13288 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13289 break;
13290 if ((CVal & 0xFFFFULL) == CVal)
13291 break;
13292 if ((CVal & 0xFFFF0000ULL) == CVal)
13293 break;
13294 if ((CVal & 0xFFFF00000000ULL) == CVal)
13295 break;
13296 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13297 break;
13298 uint64_t NCVal = ~CVal;
13299 if ((NCVal & 0xFFFFULL) == NCVal)
13300 break;
13301 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13302 break;
13303 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13304 break;
13305 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13306 break;
13307 return;
13308 }
13309 default:
13310 return;
13311 }
13312
13313 // All assembler immediates are 64-bit integers.
13314 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13315 break;
13316 }
13317
13318 if (Result.getNode()) {
13319 Ops.push_back(Result);
13320 return;
13321 }
13322
13323 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13324}
13325
13326//===----------------------------------------------------------------------===//
13327// AArch64 Advanced SIMD Support
13328//===----------------------------------------------------------------------===//
13329
13330/// WidenVector - Given a value in the V64 register class, produce the
13331/// equivalent value in the V128 register class.
13332 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13333 EVT VT = V64Reg.getValueType();
13334 unsigned NarrowSize = VT.getVectorNumElements();
13335 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13336 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13337 SDLoc DL(V64Reg);
13338
13339 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13340 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13341}
13342
13343/// getExtFactor - Determine the adjustment factor for the position when
13344/// generating an "extract from vector registers" instruction.
13345static unsigned getExtFactor(SDValue &V) {
13346 EVT EltType = V.getValueType().getVectorElementType();
13347 return EltType.getSizeInBits() / 8;
13348}
13349
13350// Check if a vector is built from one vector via extracted elements of
13351// another together with an AND mask, ensuring that all elements fit
13352// within range. This can be reconstructed using AND and NEON's TBL1.
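 // Conceptually, a v8i8 BUILD_VECTOR whose lane i is
 //   extractelt(Src, and(extractelt(MaskVec, i), 7))
 // can be emitted as Masked = AND(MaskVec, splat(7)) followed by
 // TBL1(Src, Masked), since the AND keeps every index within the source vector.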
13353 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
13354 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13355 SDLoc DL(Op);
13356 EVT VT = Op.getValueType();
13357 assert(!VT.isScalableVector() &&
13358 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13359
13360 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13361 // directly to TBL1.
13362 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13363 return SDValue();
13364
13365 unsigned NumElts = VT.getVectorNumElements();
13366 assert((NumElts == 8 || NumElts == 16) &&
13367 "Need to have exactly 8 or 16 elements in vector.");
13368
13369 SDValue SourceVec;
13370 SDValue MaskSourceVec;
13371 SmallVector<SDValue, 16> AndMaskConstants;
13372
13373 for (unsigned i = 0; i < NumElts; ++i) {
13374 SDValue V = Op.getOperand(i);
13375 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13376 return SDValue();
13377
13378 SDValue OperandSourceVec = V.getOperand(0);
13379 if (!SourceVec)
13380 SourceVec = OperandSourceVec;
13381 else if (SourceVec != OperandSourceVec)
13382 return SDValue();
13383
13384 // This only looks at shuffles with elements that are
13385 // a) truncated by a constant AND mask extracted from a mask vector, or
13386 // b) extracted directly from a mask vector.
13387 SDValue MaskSource = V.getOperand(1);
13388 if (MaskSource.getOpcode() == ISD::AND) {
13389 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13390 return SDValue();
13391
13392 AndMaskConstants.push_back(MaskSource.getOperand(1));
13393 MaskSource = MaskSource->getOperand(0);
13394 } else if (!AndMaskConstants.empty()) {
13395 // Either all or no operands should have an AND mask.
13396 return SDValue();
13397 }
13398
13399 // An ANY_EXTEND may be inserted between the AND and the source vector
13400 // extraction. We don't care about that, so we can just skip it.
13401 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13402 MaskSource = MaskSource.getOperand(0);
13403
13404 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13405 return SDValue();
13406
13407 SDValue MaskIdx = MaskSource.getOperand(1);
13408 if (!isa<ConstantSDNode>(MaskIdx) ||
13409 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13410 return SDValue();
13411
13412 // We only apply this if all elements come from the same vector with the
13413 // same vector type.
13414 if (!MaskSourceVec) {
13415 MaskSourceVec = MaskSource->getOperand(0);
13416 if (MaskSourceVec.getValueType() != VT)
13417 return SDValue();
13418 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13419 return SDValue();
13420 }
13421 }
13422
13423 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13424 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13425 // insert, we know that the index in the mask must be smaller than the number
13426 // of elements in the source, or we would have an out-of-bounds access.
13427 if (NumElts == 8)
13428 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13429 DAG.getUNDEF(VT));
13430
13431 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13432 if (!AndMaskConstants.empty())
13433 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13434 DAG.getBuildVector(VT, DL, AndMaskConstants));
13435
13436 return DAG.getNode(
13437 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13438 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13439 MaskSourceVec);
13440}
13441
13442// Gather data to see if the operation can be modelled as a
13443// shuffle in combination with VEXTs.
13444 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13445 SelectionDAG &DAG) const {
13446 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13447 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13448 SDLoc DL(Op);
13449 EVT VT = Op.getValueType();
13450 assert(!VT.isScalableVector() &&
13451 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13452 unsigned NumElts = VT.getVectorNumElements();
13453
13454 struct ShuffleSourceInfo {
13455 SDValue Vec;
13456 unsigned MinElt;
13457 unsigned MaxElt;
13458
13459 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13460 // be compatible with the shuffle we intend to construct. As a result
13461 // ShuffleVec will be some sliding window into the original Vec.
13462 SDValue ShuffleVec;
13463
13464 // Code should guarantee that element i in Vec starts at element "WindowBase
13465 // + i * WindowScale in ShuffleVec".
13466 int WindowBase;
13467 int WindowScale;
13468
13469 ShuffleSourceInfo(SDValue Vec)
13470 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13471 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13472
13473 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13474 };
13475
13476 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13477 // node.
13478 SmallVector<ShuffleSourceInfo, 2> Sources;
13479 for (unsigned i = 0; i < NumElts; ++i) {
13480 SDValue V = Op.getOperand(i);
13481 if (V.isUndef())
13482 continue;
13483 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13484 !isa<ConstantSDNode>(V.getOperand(1)) ||
13485 V.getOperand(0).getValueType().isScalableVector()) {
13486 LLVM_DEBUG(
13487 dbgs() << "Reshuffle failed: "
13488 "a shuffle can only come from building a vector from "
13489 "various elements of other fixed-width vectors, provided "
13490 "their indices are constant\n");
13491 return SDValue();
13492 }
13493
13494 // Add this element source to the list if it's not already there.
13495 SDValue SourceVec = V.getOperand(0);
13496 auto Source = find(Sources, SourceVec);
13497 if (Source == Sources.end())
13498 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13499
13500 // Update the minimum and maximum lane number seen.
13501 unsigned EltNo = V.getConstantOperandVal(1);
13502 Source->MinElt = std::min(Source->MinElt, EltNo);
13503 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13504 }
13505
13506 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13507 // better than moving to/from gpr registers for larger vectors.
13508 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13509 // Construct a mask for the tbl. We may need to adjust the index for types
13510 // larger than i8.
13511 SmallVector<unsigned, 16> Mask;
13512 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13513 for (unsigned I = 0; I < NumElts; ++I) {
13514 SDValue V = Op.getOperand(I);
13515 if (V.isUndef()) {
13516 for (unsigned OF = 0; OF < OutputFactor; OF++)
13517 Mask.push_back(-1);
13518 continue;
13519 }
13520 // Set the Mask lanes adjusted for the size of the input and output
13521 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13522 // output element, adjusted in their positions per input and output types.
13523 unsigned Lane = V.getConstantOperandVal(1);
13524 for (unsigned S = 0; S < Sources.size(); S++) {
13525 if (V.getOperand(0) == Sources[S].Vec) {
13526 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13527 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13528 for (unsigned OF = 0; OF < OutputFactor; OF++)
13529 Mask.push_back(InputBase + OF);
13530 break;
13531 }
13532 }
13533 }
13534
13535 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13536 // v16i8, and the TBLMask
13537 SmallVector<SDValue, 16> TBLOperands;
13538 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13539 ? Intrinsic::aarch64_neon_tbl3
13540 : Intrinsic::aarch64_neon_tbl4,
13541 DL, MVT::i32));
13542 for (unsigned i = 0; i < Sources.size(); i++) {
13543 SDValue Src = Sources[i].Vec;
13544 EVT SrcVT = Src.getValueType();
13545 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13546 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13547 "Expected a legally typed vector");
13548 if (SrcVT.is64BitVector())
13549 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13550 DAG.getUNDEF(MVT::v8i8));
13551 TBLOperands.push_back(Src);
13552 }
13553
13554 SmallVector<SDValue, 16> TBLMask;
13555 for (unsigned i = 0; i < Mask.size(); i++)
13556 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13557 assert((Mask.size() == 8 || Mask.size() == 16) &&
13558 "Expected a v8i8 or v16i8 Mask");
13559 TBLOperands.push_back(DAG.getBuildVector(
13560 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13561
13562 SDValue Shuffle =
13563 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13564 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13565 return DAG.getBitcast(VT, Shuffle);
13566 }
13567
13568 if (Sources.size() > 2) {
13569 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13570 << "sensible when at most two source vectors are "
13571 << "involved\n");
13572 return SDValue();
13573 }
13574
13575 // Find out the smallest element size among result and two sources, and use
13576 // it as element size to build the shuffle_vector.
13577 EVT SmallestEltTy = VT.getVectorElementType();
13578 for (auto &Source : Sources) {
13579 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13580 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13581 SmallestEltTy = SrcEltTy;
13582 }
13583 }
13584 unsigned ResMultiplier =
13585 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13586 uint64_t VTSize = VT.getFixedSizeInBits();
13587 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13588 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13589
13590 // If the source vector is too wide or too narrow, we may nevertheless be able
13591 // to construct a compatible shuffle either by concatenating it with UNDEF or
13592 // extracting a suitable range of elements.
13593 for (auto &Src : Sources) {
13594 EVT SrcVT = Src.ShuffleVec.getValueType();
13595
13596 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13597 if (SrcVTSize == TypeSize::getFixed(VTSize))
13598 continue;
13599
13600 // This stage of the search produces a source with the same element type as
13601 // the original, but with a total width matching the BUILD_VECTOR output.
13602 EVT EltVT = SrcVT.getVectorElementType();
13603 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13604 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13605
13606 if (SrcVTSize.getFixedValue() < VTSize) {
13607 assert(2 * SrcVTSize == VTSize);
13608 // We can pad out the smaller vector for free, so if it's part of a
13609 // shuffle...
13610 Src.ShuffleVec =
13611 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13612 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13613 continue;
13614 }
13615
13616 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13617 LLVM_DEBUG(
13618 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13619 return SDValue();
13620 }
13621
13622 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13623 LLVM_DEBUG(
13624 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13625 return SDValue();
13626 }
13627
13628 if (Src.MinElt >= NumSrcElts) {
13629 // The extraction can just take the second half
13630 Src.ShuffleVec =
13631 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13632 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13633 Src.WindowBase = -NumSrcElts;
13634 } else if (Src.MaxElt < NumSrcElts) {
13635 // The extraction can just take the first half
13636 Src.ShuffleVec =
13637 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13638 DAG.getConstant(0, DL, MVT::i64));
13639 } else {
13640 // An actual VEXT is needed
13641 SDValue VEXTSrc1 =
13642 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13643 DAG.getConstant(0, DL, MVT::i64));
13644 SDValue VEXTSrc2 =
13645 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13646 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13647 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13648
13649 if (!SrcVT.is64BitVector()) {
13650 LLVM_DEBUG(
13651 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13652 "for SVE vectors.");
13653 return SDValue();
13654 }
13655
13656 Src.ShuffleVec =
13657 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13658 DAG.getConstant(Imm, DL, MVT::i32));
13659 Src.WindowBase = -Src.MinElt;
13660 }
13661 }
13662
13663 // Another possible incompatibility occurs from the vector element types. We
13664 // can fix this by bitcasting the source vectors to the same type we intend
13665 // for the shuffle.
13666 for (auto &Src : Sources) {
13667 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13668 if (SrcEltTy == SmallestEltTy)
13669 continue;
13670 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13671 if (DAG.getDataLayout().isBigEndian()) {
13672 Src.ShuffleVec =
13673 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13674 } else {
13675 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13676 }
13677 Src.WindowScale =
13678 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13679 Src.WindowBase *= Src.WindowScale;
13680 }
13681
13682 // Final check before we try to actually produce a shuffle.
13683 LLVM_DEBUG({
13684 for (auto Src : Sources)
13685 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13686 });
13687
13688 // The stars all align, our next step is to produce the mask for the shuffle.
13689 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13690 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13691 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13692 SDValue Entry = Op.getOperand(i);
13693 if (Entry.isUndef())
13694 continue;
13695
13696 auto Src = find(Sources, Entry.getOperand(0));
13697 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13698
13699 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13700 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13701 // segment.
13702 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13703 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13704 VT.getScalarSizeInBits());
13705 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13706
13707 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13708 // starting at the appropriate offset.
13709 int *LaneMask = &Mask[i * ResMultiplier];
13710
13711 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13712 ExtractBase += NumElts * (Src - Sources.begin());
13713 for (int j = 0; j < LanesDefined; ++j)
13714 LaneMask[j] = ExtractBase + j;
13715 }
13716
13717 // Final check before we try to produce nonsense...
13718 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13719 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13720 return SDValue();
13721 }
13722
13723 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13724 for (unsigned i = 0; i < Sources.size(); ++i)
13725 ShuffleOps[i] = Sources[i].ShuffleVec;
13726
13727 SDValue Shuffle =
13728 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13729 SDValue V;
13730 if (DAG.getDataLayout().isBigEndian()) {
13731 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13732 } else {
13733 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13734 }
13735
13736 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13737 dbgs() << "Reshuffle, creating node: "; V.dump(););
13738
13739 return V;
13740}
13741
13742// check if an EXT instruction can handle the shuffle mask when the
13743// vector sources of the shuffle are the same.
13744static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13745 unsigned NumElts = VT.getVectorNumElements();
13746
13747 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13748 if (M[0] < 0)
13749 return false;
13750
13751 Imm = M[0];
13752
13753 // If this is a VEXT shuffle, the immediate value is the index of the first
13754 // element. The other shuffle indices must be the successive elements after
13755 // the first one.
13756 unsigned ExpectedElt = Imm;
13757 for (unsigned i = 1; i < NumElts; ++i) {
13758 // Increment the expected index. If it wraps around, just follow it
13759 // back to index zero and keep going.
13760 ++ExpectedElt;
13761 if (ExpectedElt == NumElts)
13762 ExpectedElt = 0;
13763
13764 if (M[i] < 0)
13765 continue; // ignore UNDEF indices
13766 if (ExpectedElt != static_cast<unsigned>(M[i]))
13767 return false;
13768 }
13769
13770 return true;
13771}
13772
13773// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13774// v4i32s. This is really a truncate, which we can construct out of (legal)
13775// concats and truncate nodes.
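 // For example, with four v4i32 inputs a, b, c and d, the v16i8 build vector
 // <a0,a1,a2,a3,b0,...,d3> (taking the low byte of each element) becomes
 // trunc(concat(trunc(a), trunc(b))) and trunc(concat(trunc(c), trunc(d))),
 // which are then concatenated into a single v16i8.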
13776 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
13777 if (V.getValueType() != MVT::v16i8)
13778 return SDValue();
13779 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13780
13781 for (unsigned X = 0; X < 4; X++) {
13782 // Check the first item in each group is an extract from lane 0 of a v4i32
13783 // or v4i16.
13784 SDValue BaseExt = V.getOperand(X * 4);
13785 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13786 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13787 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13788 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13789 BaseExt.getConstantOperandVal(1) != 0)
13790 return SDValue();
13791 SDValue Base = BaseExt.getOperand(0);
13792 // And check the other items are extracts from the same vector.
13793 for (unsigned Y = 1; Y < 4; Y++) {
13794 SDValue Ext = V.getOperand(X * 4 + Y);
13795 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13796 Ext.getOperand(0) != Base ||
13797 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13798 Ext.getConstantOperandVal(1) != Y)
13799 return SDValue();
13800 }
13801 }
13802
13803 // Turn the buildvector into a series of truncates and concats, which will
13804 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13805 // concatenated together to produce 2 v8i16. These are both truncated and
13806 // concatenated together.
13807 SDLoc DL(V);
13808 SDValue Trunc[4] = {
13809 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13810 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13811 for (SDValue &V : Trunc)
13812 if (V.getValueType() == MVT::v4i32)
13813 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13814 SDValue Concat0 =
13815 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13816 SDValue Concat1 =
13817 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13818 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13819 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13820 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13821}
13822
13823/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13824/// element width than the vector lane type. If that is the case the function
13825/// returns true and writes the value of the DUP instruction lane operand into
13826/// DupLaneOp
13827static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13828 unsigned &DupLaneOp) {
13829 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13830 "Only possible block sizes for wide DUP are: 16, 32, 64");
13831
13832 if (BlockSize <= VT.getScalarSizeInBits())
13833 return false;
13834 if (BlockSize % VT.getScalarSizeInBits() != 0)
13835 return false;
13836 if (VT.getSizeInBits() % BlockSize != 0)
13837 return false;
13838
13839 size_t SingleVecNumElements = VT.getVectorNumElements();
13840 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13841 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13842
13843 // We are looking for masks like
13844 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13845 // might be replaced by 'undefined'. BlockIndices will eventually contain
13846 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13847 // for the above examples)
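 // For example, for v8i16 and BlockSize == 32 the mask
 // <2, 3, 2, 3, 2, 3, 2, 3> repeats the 32-bit block formed by lanes {2, 3}, so
 // DupLaneOp becomes 1 and the shuffle can be emitted as a DUP of the second
 // 32-bit element of the bitcast source.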
13848 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13849 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13850 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13851 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13852 if (Elt < 0)
13853 continue;
13854 // For now we don't support shuffles that use the second operand
13855 if ((unsigned)Elt >= SingleVecNumElements)
13856 return false;
13857 if (BlockElts[I] < 0)
13858 BlockElts[I] = Elt;
13859 else if (BlockElts[I] != Elt)
13860 return false;
13861 }
13862
13863 // We found a candidate block (possibly with some undefs). It must be a
13864 // sequence of consecutive integers starting with a value divisible by
13865 // NumEltsPerBlock with some values possibly replaced by undef-s.
13866
13867 // Find first non-undef element
13868 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13869 assert(FirstRealEltIter != BlockElts.end() &&
13870 "Shuffle with all-undefs must have been caught by previous cases, "
13871 "e.g. isSplat()");
13872 if (FirstRealEltIter == BlockElts.end()) {
13873 DupLaneOp = 0;
13874 return true;
13875 }
13876
13877 // Index of FirstRealElt in BlockElts
13878 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13879
13880 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13881 return false;
13882 // BlockElts[0] must have the following value if it isn't undef:
13883 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13884
13885 // Check the first element
13886 if (Elt0 % NumEltsPerBlock != 0)
13887 return false;
13888 // Check that the sequence indeed consists of consecutive integers (modulo
13889 // undefs)
13890 for (size_t I = 0; I < NumEltsPerBlock; I++)
13891 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13892 return false;
13893
13894 DupLaneOp = Elt0 / NumEltsPerBlock;
13895 return true;
13896}
13897
13898// check if an EXT instruction can handle the shuffle mask when the
13899// vector sources of the shuffle are different.
13900static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13901 unsigned &Imm) {
13902 // Look for the first non-undef element.
13903 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13904
13905 // Benefit from APInt to handle overflow when calculating expected element.
13906 unsigned NumElts = VT.getVectorNumElements();
13907 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13908 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13909 /*implicitTrunc=*/true);
13910 // The following shuffle indices must be the successive elements after the
13911 // first real element.
13912 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13913 return Elt != ExpectedElt++ && Elt >= 0;
13914 });
13915 if (FoundWrongElt)
13916 return false;
13917
13918 // The index of an EXT is the first element if it is not UNDEF.
13919 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13920 // value of the first element. E.g.
13921 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13922 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13923 // ExpectedElt is the last mask index plus 1.
13924 Imm = ExpectedElt.getZExtValue();
13925
13926 // There are two different cases that require reversing the input vectors.
13927 // For example, for vector <4 x i32> we have the following cases,
13928 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13929 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13930 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13931 // to reverse two input vectors.
13932 if (Imm < NumElts)
13933 ReverseEXT = true;
13934 else
13935 Imm -= NumElts;
13936
13937 return true;
13938}
13939
13940/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13941/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13942/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13943static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13944 unsigned NumElts = VT.getVectorNumElements();
13945 if (NumElts % 2 != 0)
13946 return false;
13947 WhichResult = (M[0] == 0 ? 0 : 1);
13948 unsigned Idx = WhichResult * NumElts / 2;
13949 for (unsigned i = 0; i != NumElts; i += 2) {
13950 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13951 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13952 return false;
13953 Idx += 1;
13954 }
13955
13956 return true;
13957}
13958
13959/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13960/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13961 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13962static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13963 unsigned Half = VT.getVectorNumElements() / 2;
13964 WhichResult = (M[0] == 0 ? 0 : 1);
13965 for (unsigned j = 0; j != 2; ++j) {
13966 unsigned Idx = WhichResult;
13967 for (unsigned i = 0; i != Half; ++i) {
13968 int MIdx = M[i + j * Half];
13969 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13970 return false;
13971 Idx += 2;
13972 }
13973 }
13974
13975 return true;
13976}
13977
13978/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13979/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13980/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13981static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13982 unsigned NumElts = VT.getVectorNumElements();
13983 if (NumElts % 2 != 0)
13984 return false;
13985 WhichResult = (M[0] == 0 ? 0 : 1);
13986 for (unsigned i = 0; i < NumElts; i += 2) {
13987 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13988 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13989 return false;
13990 }
13991 return true;
13992}
13993
13994static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13995 bool &DstIsLeft, int &Anomaly) {
13996 if (M.size() != static_cast<size_t>(NumInputElements))
13997 return false;
13998
13999 int NumLHSMatch = 0, NumRHSMatch = 0;
14000 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14001
14002 for (int i = 0; i < NumInputElements; ++i) {
14003 if (M[i] == -1) {
14004 ++NumLHSMatch;
14005 ++NumRHSMatch;
14006 continue;
14007 }
14008
14009 if (M[i] == i)
14010 ++NumLHSMatch;
14011 else
14012 LastLHSMismatch = i;
14013
14014 if (M[i] == i + NumInputElements)
14015 ++NumRHSMatch;
14016 else
14017 LastRHSMismatch = i;
14018 }
14019
14020 if (NumLHSMatch == NumInputElements - 1) {
14021 DstIsLeft = true;
14022 Anomaly = LastLHSMismatch;
14023 return true;
14024 } else if (NumRHSMatch == NumInputElements - 1) {
14025 DstIsLeft = false;
14026 Anomaly = LastRHSMismatch;
14027 return true;
14028 }
14029
14030 return false;
14031}
14032
14033static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14034 if (VT.getSizeInBits() != 128)
14035 return false;
14036
14037 unsigned NumElts = VT.getVectorNumElements();
14038
14039 for (int I = 0, E = NumElts / 2; I != E; I++) {
14040 if (Mask[I] != I)
14041 return false;
14042 }
14043
14044 int Offset = NumElts / 2;
14045 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14046 if (Mask[I] != I + SplitLHS * Offset)
14047 return false;
14048 }
14049
14050 return true;
14051}
14052
14053 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
14054 SDLoc DL(Op);
14055 EVT VT = Op.getValueType();
14056 SDValue V0 = Op.getOperand(0);
14057 SDValue V1 = Op.getOperand(1);
14058 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14059
14060 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14061 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14062 return SDValue();
14063
14064 bool SplitV0 = V0.getValueSizeInBits() == 128;
14065
14066 if (!isConcatMask(Mask, VT, SplitV0))
14067 return SDValue();
14068
14069 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14070 if (SplitV0) {
14071 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14072 DAG.getConstant(0, DL, MVT::i64));
14073 }
14074 if (V1.getValueSizeInBits() == 128) {
14075 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14076 DAG.getConstant(0, DL, MVT::i64));
14077 }
14078 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14079}
14080
14081/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14082/// the specified operations to build the shuffle. ID is the perfect-shuffle
14083 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14084 /// table entry and LHS/RHS are the immediate inputs for this stage of the
14085 /// shuffle.
14087 unsigned PFEntry, SDValue LHS,
14088 SDValue RHS, SelectionDAG &DAG,
14089 const SDLoc &DL) {
14090 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14091 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14092 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14093
14094 enum {
14095 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14096 OP_VREV,
14097 OP_VDUP0,
14098 OP_VDUP1,
14099 OP_VDUP2,
14100 OP_VDUP3,
14101 OP_VEXT1,
14102 OP_VEXT2,
14103 OP_VEXT3,
14104 OP_VUZPL, // VUZP, left result
14105 OP_VUZPR, // VUZP, right result
14106 OP_VZIPL, // VZIP, left result
14107 OP_VZIPR, // VZIP, right result
14108 OP_VTRNL, // VTRN, left result
14109 OP_VTRNR, // VTRN, right result
14110 OP_MOVLANE // Move lane. RHSID is the lane to move into
14111 };
14112
14113 if (OpNum == OP_COPY) {
14114 if (LHSID == (1 * 9 + 2) * 9 + 3)
14115 return LHS;
14116 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14117 return RHS;
14118 }
14119
14120 if (OpNum == OP_MOVLANE) {
14121 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14122 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14123 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14124 Elt = 3 - Elt;
14125 while (Elt > 0) {
14126 ID /= 9;
14127 Elt--;
14128 }
14129 return (ID % 9 == 8) ? -1 : ID % 9;
14130 };
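 // For example, the ID ((0*9+1)*9+2)*9+3 == 102 encodes the mask <0,1,2,3>:
 // getPFIDLane(102, 0) == 0 and getPFIDLane(102, 2) == 2, while a base-9 digit
 // of 8 in any position decodes to -1 (undef).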
14131
14132 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14133 // get the lane to move from the PFID, which is always from the
14134 // original vectors (V1 or V2).
14135 SDValue OpLHS = GeneratePerfectShuffle(
14136 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14137 EVT VT = OpLHS.getValueType();
14138 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14139 unsigned ExtLane = 0;
14140 SDValue Input;
14141
14142 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14143 // convert into a higher type.
14144 if (RHSID & 0x4) {
14145 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14146 if (MaskElt == -1)
14147 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14148 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14149 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14150 Input = MaskElt < 2 ? V1 : V2;
14151 if (VT.getScalarSizeInBits() == 16) {
14152 Input = DAG.getBitcast(MVT::v2f32, Input);
14153 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14154 } else {
14155 assert(VT.getScalarSizeInBits() == 32 &&
14156 "Expected 16 or 32 bit shuffle elements");
14157 Input = DAG.getBitcast(MVT::v2f64, Input);
14158 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14159 }
14160 } else {
14161 int MaskElt = getPFIDLane(ID, RHSID);
14162 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14163 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14164 Input = MaskElt < 4 ? V1 : V2;
14165 // Be careful about creating illegal types. Use f16 instead of i16.
14166 if (VT == MVT::v4i16) {
14167 Input = DAG.getBitcast(MVT::v4f16, Input);
14168 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14169 }
14170 }
14171 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14172 Input.getValueType().getVectorElementType(),
14173 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14174 SDValue Ins =
14175 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14176 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14177 return DAG.getBitcast(VT, Ins);
14178 }
14179
14180 SDValue OpLHS, OpRHS;
14181 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14182 RHS, DAG, DL);
14183 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14184 RHS, DAG, DL);
14185 EVT VT = OpLHS.getValueType();
14186
14187 switch (OpNum) {
14188 default:
14189 llvm_unreachable("Unknown shuffle opcode!");
14190 case OP_VREV:
14191 // VREV divides the vector in half and swaps within the half.
14192 if (VT.getVectorElementType() == MVT::i32 ||
14193 VT.getVectorElementType() == MVT::f32)
14194 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14195 // vrev <4 x i16> -> REV32
14196 if (VT.getVectorElementType() == MVT::i16 ||
14197 VT.getVectorElementType() == MVT::f16 ||
14198 VT.getVectorElementType() == MVT::bf16)
14199 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14200 // vrev <4 x i8> -> REV16
14201 assert(VT.getVectorElementType() == MVT::i8);
14202 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14203 case OP_VDUP0:
14204 case OP_VDUP1:
14205 case OP_VDUP2:
14206 case OP_VDUP3: {
14207 EVT EltTy = VT.getVectorElementType();
14208 unsigned Opcode;
14209 if (EltTy == MVT::i8)
14210 Opcode = AArch64ISD::DUPLANE8;
14211 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14212 Opcode = AArch64ISD::DUPLANE16;
14213 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14214 Opcode = AArch64ISD::DUPLANE32;
14215 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14216 Opcode = AArch64ISD::DUPLANE64;
14217 else
14218 llvm_unreachable("Invalid vector element type?");
14219
14220 if (VT.getSizeInBits() == 64)
14221 OpLHS = WidenVector(OpLHS, DAG);
14222 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14223 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14224 }
14225 case OP_VEXT1:
14226 case OP_VEXT2:
14227 case OP_VEXT3: {
14228 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14229 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14230 DAG.getConstant(Imm, DL, MVT::i32));
14231 }
14232 case OP_VUZPL:
14233 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14234 case OP_VUZPR:
14235 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14236 case OP_VZIPL:
14237 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14238 case OP_VZIPR:
14239 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14240 case OP_VTRNL:
14241 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14242 case OP_VTRNR:
14243 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14244 }
14245}
14246
14247static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14248 SelectionDAG &DAG) {
14249 // Check to see if we can use the TBL instruction.
14250 SDValue V1 = Op.getOperand(0);
14251 SDValue V2 = Op.getOperand(1);
14252 SDLoc DL(Op);
14253
14254 EVT EltVT = Op.getValueType().getVectorElementType();
14255 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14256
14257 bool Swap = false;
14258 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14259 std::swap(V1, V2);
14260 Swap = true;
14261 }
14262
14263 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14264 // out of range values with 0s. We do need to make sure that any out-of-range
14265 // values are really out-of-range for a v16i8 vector.
14266 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14267 MVT IndexVT = MVT::v8i8;
14268 unsigned IndexLen = 8;
14269 if (Op.getValueSizeInBits() == 128) {
14270 IndexVT = MVT::v16i8;
14271 IndexLen = 16;
14272 }
14273
14274 SmallVector<SDValue, 8> TBLMask;
14275 for (int Val : ShuffleMask) {
14276 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14277 unsigned Offset = Byte + Val * BytesPerElt;
14278 if (Swap)
14279 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14280 if (IsUndefOrZero && Offset >= IndexLen)
14281 Offset = 255;
14282 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14283 }
14284 }
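 // e.g. a v4i16 shuffle with mask <0,2,4,6> (no swap, no zero operand) builds
 // the byte-index vector <0,1,4,5,8,9,12,13>; bytes of the second source start
 // at offset IndexLen in the concatenated table.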
14285
14286 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14287 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14288
14289 SDValue Shuffle;
14290 if (IsUndefOrZero) {
14291 if (IndexLen == 8)
14292 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14293 Shuffle = DAG.getNode(
14294 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14295 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14296 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14297 } else {
14298 if (IndexLen == 8) {
14299 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14300 Shuffle = DAG.getNode(
14301 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14302 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14303 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14304 } else {
14305 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14306 // cannot currently represent the register constraints on the input
14307 // table registers.
14308 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14309 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14310 // IndexLen));
14311 Shuffle = DAG.getNode(
14312 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14313 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
14314 V2Cst,
14315 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14316 }
14317 }
14318 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14319}
14320
14321static unsigned getDUPLANEOp(EVT EltType) {
14322 if (EltType == MVT::i8)
14323 return AArch64ISD::DUPLANE8;
14324 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14325 return AArch64ISD::DUPLANE16;
14326 if (EltType == MVT::i32 || EltType == MVT::f32)
14327 return AArch64ISD::DUPLANE32;
14328 if (EltType == MVT::i64 || EltType == MVT::f64)
14329 return AArch64ISD::DUPLANE64;
14330
14331 llvm_unreachable("Invalid vector element type?");
14332}
14333
14334static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14335 unsigned Opcode, SelectionDAG &DAG) {
14336 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14337 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14338 // Match: dup (bitcast (extract_subv X, C)), LaneC
14339 if (BitCast.getOpcode() != ISD::BITCAST ||
14340 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14341 return false;
14342
14343 // The extract index must align in the destination type. That may not
14344 // happen if the bitcast is from a narrow to a wide type.
14345 SDValue Extract = BitCast.getOperand(0);
14346 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14347 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14348 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14349 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14350 if (ExtIdxInBits % CastedEltBitWidth != 0)
14351 return false;
14352
14353 // Can't handle cases where vector size is not 128-bit
14354 if (!Extract.getOperand(0).getValueType().is128BitVector())
14355 return false;
14356
14357 // Update the lane value by offsetting with the scaled extract index.
14358 LaneC += ExtIdxInBits / CastedEltBitWidth;
14359
14360 // Determine the casted vector type of the wide vector input.
14361 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14362 // Examples:
14363 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14364 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14365 unsigned SrcVecNumElts =
14366 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14367 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getVectorElementType(),
14368 SrcVecNumElts);
14369 return true;
14370 };
14371 MVT CastVT;
14372 if (getScaledOffsetDup(V, Lane, CastVT)) {
14373 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14374 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14375 V.getOperand(0).getValueType().is128BitVector()) {
14376 // The lane is incremented by the index of the extract.
14377 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14378 Lane += V.getConstantOperandVal(1);
14379 V = V.getOperand(0);
14380 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14381 // The lane is decremented if we are splatting from the 2nd operand.
14382 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14383 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14384 Lane -= Idx * VT.getVectorNumElements() / 2;
14385 V = WidenVector(V.getOperand(Idx), DAG);
14386 } else if (VT.getSizeInBits() == 64) {
14387 // Widen the operand to 128-bit register with undef.
14388 V = WidenVector(V, DAG);
14389 }
14390 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14391}
14392
14393// Try to widen element type to get a new mask value for a better permutation
14394// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14395// UZP1/2, TRN1/2, REV, INS, etc.
14396// For example:
14397// shufflevector <4 x i32> %a, <4 x i32> %b,
14398// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14399// is equivalent to:
14400// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14401// Finally, we can get:
14402// mov v0.d[0], v1.d[1]
14403static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14404 SDLoc DL(Op);
14405 EVT VT = Op.getValueType();
14406 EVT ScalarVT = VT.getVectorElementType();
14407 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14408 SDValue V0 = Op.getOperand(0);
14409 SDValue V1 = Op.getOperand(1);
14410 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14411
14412 // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64,
14413 // we need to make sure the wider element type is legal. Thus, ElementSize
14414 // should not be larger than 32 bits, and the i1 type should also be excluded.
14415 if (ElementSize > 32 || ElementSize == 1)
14416 return SDValue();
14417
14418 SmallVector<int, 8> NewMask;
14419 if (widenShuffleMaskElts(Mask, NewMask)) {
14420 MVT NewEltVT = VT.isFloatingPoint()
14421 ? MVT::getFloatingPointVT(ElementSize * 2)
14422 : MVT::getIntegerVT(ElementSize * 2);
14423 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14424 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14425 V0 = DAG.getBitcast(NewVT, V0);
14426 V1 = DAG.getBitcast(NewVT, V1);
14427 return DAG.getBitcast(VT,
14428 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14429 }
14430 }
14431
14432 return SDValue();
14433}
14434
14435// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14436static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14437 ArrayRef<int> ShuffleMask,
14438 SelectionDAG &DAG) {
14439 SDValue Tbl1 = Op->getOperand(0);
14440 SDValue Tbl2 = Op->getOperand(1);
14441 SDLoc DL(Op);
14442 SDValue Tbl2ID =
14443 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14444
14445 EVT VT = Op.getValueType();
14446 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14447 Tbl1.getOperand(0) != Tbl2ID ||
14448 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14449 Tbl2.getOperand(0) != Tbl2ID)
14450 return SDValue();
14451
14452 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14453 return SDValue();
14454
14455 SDValue Mask1 = Tbl1.getOperand(3);
14456 SDValue Mask2 = Tbl2.getOperand(3);
14457 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14458 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14459 return SDValue();
14460
14461 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14462 for (unsigned I = 0; I < 16; I++) {
14463 if (ShuffleMask[I] < 16)
14464 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14465 else {
14466 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14467 if (!C)
14468 return SDValue();
14469 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14470 }
14471 }
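 // Indices taken from the second TBL2 referred to its own pair of source
 // registers (bytes 0..31); in the combined TBL4 those registers become the
 // third and fourth sources, hence the rebasing by +32 above.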
14472
14473 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14474 SDValue ID =
14475 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14476
14477 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14478 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14479 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14480}
14481
14482// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14483// but we don't have an appropriate instruction,
14484// so custom-lower it as ZIP1-with-zeros.
14485SDValue
14486AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14487 SelectionDAG &DAG) const {
14488 SDLoc DL(Op);
14489 EVT VT = Op.getValueType();
14490 SDValue SrcOp = Op.getOperand(0);
14491 EVT SrcVT = SrcOp.getValueType();
14492 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14493 "Unexpected extension factor.");
14494 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14495 // FIXME: support multi-step zipping?
14496 if (Scale != 2)
14497 return SDValue();
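 // ZIP1 with a zero vector interleaves <s0,0,s1,0,...>; read back in the wider
 // type (little-endian lane order) this is the in-register zero-extension of
 // the source's low lanes.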
14498 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14499 return DAG.getBitcast(VT,
14500 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14501}
14502
14503SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14504 SelectionDAG &DAG) const {
14505 SDLoc DL(Op);
14506 EVT VT = Op.getValueType();
14507
14508 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14509
14510 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14511 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14512
14513 // Convert shuffles that are directly supported on NEON to target-specific
14514 // DAG nodes, instead of keeping them as shuffles and matching them again
14515 // during code selection. This is more efficient and avoids the possibility
14516 // of inconsistencies between legalization and selection.
14517 ArrayRef<int> ShuffleMask = SVN->getMask();
14518
14519 SDValue V1 = Op.getOperand(0);
14520 SDValue V2 = Op.getOperand(1);
14521
14522 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14523 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14524 "Unexpected VECTOR_SHUFFLE mask size!");
14525
14526 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14527 return Res;
14528
14529 if (SVN->isSplat()) {
14530 int Lane = SVN->getSplatIndex();
14531 // If this is an undef splat, generate it via a plain DUP, if possible.
14532 if (Lane == -1)
14533 Lane = 0;
14534
14535 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14536 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14537 V1.getOperand(0));
14538 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14539 // constant. If so, we can just reference the lane's definition directly.
14540 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14541 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14542 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14543
14544 // Otherwise, duplicate from the lane of the input vector.
14545 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14546 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14547 }
14548
14549 // Check if the mask matches a DUP for a wider element
14550 for (unsigned LaneSize : {64U, 32U, 16U}) {
14551 unsigned Lane = 0;
14552 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14553 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14554 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14555 : AArch64ISD::DUPLANE16;
14556 // Cast V1 to an integer vector with required lane size
14557 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14558 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14559 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14560 V1 = DAG.getBitcast(NewVecTy, V1);
14561 // Construct the DUP instruction
14562 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14563 // Cast back to the original type
14564 return DAG.getBitcast(VT, V1);
14565 }
14566 }
14567
14568 unsigned NumElts = VT.getVectorNumElements();
14569 unsigned EltSize = VT.getScalarSizeInBits();
14570 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14571 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14572 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14573 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14574 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14575 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14576
14577 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14578 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
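 // A full reverse is REV64 (reverse within each 64-bit half) followed by
 // EXT #8, which swaps the two 64-bit halves.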
14579 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14580 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14581 DAG.getConstant(8, DL, MVT::i32));
14582 }
14583
14584 bool ReverseEXT = false;
14585 unsigned Imm;
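 // The EXT immediate is a byte count, so the element index recovered from the
 // mask is scaled by the element size (getExtFactor) below.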
14586 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14587 if (ReverseEXT)
14588 std::swap(V1, V2);
14589 Imm *= getExtFactor(V1);
14590 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14591 DAG.getConstant(Imm, DL, MVT::i32));
14592 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14593 Imm *= getExtFactor(V1);
14594 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14595 DAG.getConstant(Imm, DL, MVT::i32));
14596 }
14597
14598 unsigned WhichResult;
14599 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14600 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14601 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14602 }
14603 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14604 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14605 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14606 }
14607 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14608 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14609 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14610 }
14611
14612 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14614 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14615 }
14616 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14617 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14618 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14619 }
14620 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14621 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14622 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14623 }
14624
14625 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14626 return Concat;
14627
14628 bool DstIsLeft;
14629 int Anomaly;
14630 int NumInputElements = V1.getValueType().getVectorNumElements();
14631 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14632 SDValue DstVec = DstIsLeft ? V1 : V2;
14633 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14634
14635 SDValue SrcVec = V1;
14636 int SrcLane = ShuffleMask[Anomaly];
14637 if (SrcLane >= NumInputElements) {
14638 SrcVec = V2;
14639 SrcLane -= NumElts;
14640 }
14641 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14642
14643 EVT ScalarVT = VT.getVectorElementType();
14644
14645 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14646 ScalarVT = MVT::i32;
14647
14648 return DAG.getNode(
14649 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14650 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14651 DstLaneV);
14652 }
14653
14654 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14655 return NewSD;
14656
14657 // If the shuffle is not directly supported and it has 4 elements, use
14658 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14659 if (NumElts == 4) {
14660 unsigned PFIndexes[4];
14661 for (unsigned i = 0; i != 4; ++i) {
14662 if (ShuffleMask[i] < 0)
14663 PFIndexes[i] = 8;
14664 else
14665 PFIndexes[i] = ShuffleMask[i];
14666 }
14667
14668 // Compute the index in the perfect shuffle table.
14669 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14670 PFIndexes[2] * 9 + PFIndexes[3];
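 // e.g. the mask <1,u,3,2> maps to digits 1,8,3,2 and the table index
 // ((1*9+8)*9+3)*9+2 = 1406.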
14671 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14672 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14673 DL);
14674 }
14675
14676 // Check for a "select shuffle", generating a BSL to pick between lanes in
14677 // V1/V2.
14678 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14679 assert(VT.getScalarSizeInBits() <= 32 &&
14680 "Expected larger vector element sizes to be handled already");
14681 SmallVector<SDValue> MaskElts;
14682 for (int M : ShuffleMask)
14683 MaskElts.push_back(DAG.getConstant(
14684 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
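 // BSP computes (Mask & V1) | (~Mask & V2), so all-ones lanes select from V1
 // and all-zeros lanes select from V2.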
14685 EVT IVT = VT.changeVectorElementTypeToInteger();
14686 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14687 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14688 DAG.getBitcast(IVT, V1),
14689 DAG.getBitcast(IVT, V2)));
14690 }
14691
14692 // Fall back to generating a TBL
14693 return GenerateTBL(Op, ShuffleMask, DAG);
14694}
14695
14696SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14697 SelectionDAG &DAG) const {
14698 EVT VT = Op.getValueType();
14699
14700 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14701 return LowerToScalableOp(Op, DAG);
14702
14703 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14704 "Unexpected vector type!");
14705
14706 // We can handle the constant cases during isel.
14707 if (isa<ConstantSDNode>(Op.getOperand(0)))
14708 return Op;
14709
14710 // There isn't a natural way to handle the general i1 case, so we use some
14711 // trickery with whilelo.
14712 SDLoc DL(Op);
14713 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14714 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14715 DAG.getValueType(MVT::i1));
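 // After the i1 sign-extension SplatVal is either 0 or all-ones, so
 // whilelo(0, SplatVal) yields an all-false or an all-true predicate
 // respectively.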
14716 SDValue ID =
14717 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14718 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14719 if (VT == MVT::nxv1i1)
14720 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14721 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14722 Zero, SplatVal),
14723 Zero);
14724 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14725}
14726
14727SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14728 SelectionDAG &DAG) const {
14729 SDLoc DL(Op);
14730
14731 EVT VT = Op.getValueType();
14732 if (!isTypeLegal(VT) || !VT.isScalableVector())
14733 return SDValue();
14734
14735 // Current lowering only supports the SVE-ACLE types.
14736 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14737 return SDValue();
14738
14739 // The DUPQ operation is independent of element type so normalise to i64s.
14740 SDValue Idx128 = Op.getOperand(2);
14741
14742 // DUPQ can be used when idx is in range.
14743 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14744 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14745 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14746 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14747 }
14748
14749 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14750
14751 // The ACLE says this must produce the same result as:
14752 // svtbl(data, svadd_x(svptrue_b64(),
14753 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14754 // index * 2))
14755 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14756 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14757
14758 // create the vector 0,1,0,1,...
14759 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14760 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14761
14762 // create the vector idx64,idx64+1,idx64,idx64+1,...
14763 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14764 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14765 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14766
14767 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14768 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14769 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14770}
14771
14772
14773static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14774 APInt &UndefBits) {
14775 EVT VT = BVN->getValueType(0);
14776 APInt SplatBits, SplatUndef;
14777 unsigned SplatBitSize;
14778 bool HasAnyUndefs;
14779 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14780 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
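 // e.g. for a v4i32 splat with SplatBitSize 32, the loop below repeats the
 // 32 splat bits four times to fill the 128-bit CnstBits/UndefBits values.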
14781
14782 for (unsigned i = 0; i < NumSplats; ++i) {
14783 CnstBits <<= SplatBitSize;
14784 UndefBits <<= SplatBitSize;
14785 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14786 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14787 }
14788
14789 return true;
14790 }
14791
14792 return false;
14793}
14794
14795// Try 64-bit splatted SIMD immediate.
14796static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14797 const APInt &Bits) {
14798 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14799 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14800 EVT VT = Op.getValueType();
14801 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14802
14803 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14804 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14805
14806 SDLoc DL(Op);
14807 SDValue Mov =
14808 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14809 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14810 }
14811 }
14812
14813 return SDValue();
14814}
14815
14816// Try 32-bit splatted SIMD immediate.
14817static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14818 const APInt &Bits,
14819 const SDValue *LHS = nullptr) {
14820 EVT VT = Op.getValueType();
14821 if (VT.isFixedLengthVector() &&
14823 return SDValue();
14824
14825 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14826 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14827 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14828 bool isAdvSIMDModImm = false;
14829 uint64_t Shift;
14830
14831 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14832 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14833 Shift = 0;
14834 }
14835 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14836 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14837 Shift = 8;
14838 }
14839 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14840 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14841 Shift = 16;
14842 }
14843 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14844 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14845 Shift = 24;
14846 }
14847
14848 if (isAdvSIMDModImm) {
14849 SDLoc DL(Op);
14850 SDValue Mov;
14851
14852 if (LHS)
14853 Mov = DAG.getNode(NewOp, DL, MovTy,
14854 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14855 DAG.getConstant(Value, DL, MVT::i32),
14856 DAG.getConstant(Shift, DL, MVT::i32));
14857 else
14858 Mov =
14859 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14860 DAG.getConstant(Shift, DL, MVT::i32));
14861
14862 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14863 }
14864 }
14865
14866 return SDValue();
14867}
14868
14869// Try 16-bit splatted SIMD immediate.
14870static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14871 const APInt &Bits,
14872 const SDValue *LHS = nullptr) {
14873 EVT VT = Op.getValueType();
14874 if (VT.isFixedLengthVector() &&
14876 return SDValue();
14877
14878 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14879 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14880 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14881 bool isAdvSIMDModImm = false;
14882 uint64_t Shift;
14883
14884 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14885 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14886 Shift = 0;
14887 }
14888 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14889 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14890 Shift = 8;
14891 }
14892
14893 if (isAdvSIMDModImm) {
14894 SDLoc DL(Op);
14895 SDValue Mov;
14896
14897 if (LHS)
14898 Mov = DAG.getNode(NewOp, DL, MovTy,
14899 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14900 DAG.getConstant(Value, DL, MVT::i32),
14901 DAG.getConstant(Shift, DL, MVT::i32));
14902 else
14903 Mov =
14904 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14905 DAG.getConstant(Shift, DL, MVT::i32));
14906
14907 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14908 }
14909 }
14910
14911 return SDValue();
14912}
14913
14914// Try 32-bit splatted SIMD immediate with shifted ones.
14915static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14916 SelectionDAG &DAG, const APInt &Bits) {
14917 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14918 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14919 EVT VT = Op.getValueType();
14920 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14921 bool isAdvSIMDModImm = false;
14922 uint64_t Shift;
14923
14924 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14925 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14926 Shift = 264;
14927 }
14928 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14929 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14930 Shift = 272;
14931 }
14932
14933 if (isAdvSIMDModImm) {
14934 SDLoc DL(Op);
14935 SDValue Mov =
14936 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14937 DAG.getConstant(Shift, DL, MVT::i32));
14938 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14939 }
14940 }
14941
14942 return SDValue();
14943}
14944
14945// Try 8-bit splatted SIMD immediate.
14946static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14947 const APInt &Bits) {
14948 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14949 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14950 EVT VT = Op.getValueType();
14951 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14952
14953 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14954 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14955
14956 SDLoc DL(Op);
14957 SDValue Mov =
14958 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14959 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14960 }
14961 }
14962
14963 return SDValue();
14964}
14965
14966// Try FP splatted SIMD immediate.
14967static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14968 const APInt &Bits) {
14969 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14970 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14971 EVT VT = Op.getValueType();
14972 bool isWide = (VT.getSizeInBits() == 128);
14973 MVT MovTy;
14974 bool isAdvSIMDModImm = false;
14975
14976 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14977 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14978 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14979 }
14980 else if (isWide &&
14981 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14982 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14983 MovTy = MVT::v2f64;
14984 }
14985
14986 if (isAdvSIMDModImm) {
14987 SDLoc DL(Op);
14988 SDValue Mov =
14989 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14990 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14991 }
14992 }
14993
14994 return SDValue();
14995}
14996
14997 // Specialized code to quickly find if PotentialBVec is a BuildVector that
14998 // consists only of the same constant int value, which is returned in the
14999 // reference argument ConstVal.
15000static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15001 uint64_t &ConstVal) {
15002 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15003 if (!Bvec)
15004 return false;
15005 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15006 if (!FirstElt)
15007 return false;
15008 EVT VT = Bvec->getValueType(0);
15009 unsigned NumElts = VT.getVectorNumElements();
15010 for (unsigned i = 1; i < NumElts; ++i)
15011 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15012 return false;
15013 ConstVal = FirstElt->getZExtValue();
15014 return true;
15015}
15016
15017static bool isAllInactivePredicate(SDValue N) {
15018 // Look through cast.
15019 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15020 N = N.getOperand(0);
15021
15022 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15023}
15024
15025static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15026 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15027
15028 // Look through cast.
15029 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15030 N = N.getOperand(0);
15031 // When reinterpreting from a type with fewer elements the "new" elements
15032 // are not active, so bail if they're likely to be used.
15033 if (N.getValueType().getVectorMinNumElements() < NumElts)
15034 return false;
15035 }
15036
15037 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15038 return true;
15039
15040 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15041 // or smaller than the implicit element type represented by N.
15042 // NOTE: A larger element count implies a smaller element type.
15043 if (N.getOpcode() == AArch64ISD::PTRUE &&
15044 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15045 return N.getValueType().getVectorMinNumElements() >= NumElts;
15046
15047 // If we're compiling for a specific vector-length, we can check if the
15048 // pattern's VL equals that of the scalable vector at runtime.
15049 if (N.getOpcode() == AArch64ISD::PTRUE) {
15050 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15051 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
15052 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
15053 if (MaxSVESize && MinSVESize == MaxSVESize) {
15054 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
15055 unsigned PatNumElts =
15056 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
15057 return PatNumElts == (NumElts * VScale);
15058 }
15059 }
15060
15061 return false;
15062}
15063
15064// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15065// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15066 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
15067// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15068// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15069// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
15070static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15071 EVT VT = N->getValueType(0);
15072
15073 if (!VT.isVector())
15074 return SDValue();
15075
15076 SDLoc DL(N);
15077
15078 SDValue And;
15079 SDValue Shift;
15080
15081 SDValue FirstOp = N->getOperand(0);
15082 unsigned FirstOpc = FirstOp.getOpcode();
15083 SDValue SecondOp = N->getOperand(1);
15084 unsigned SecondOpc = SecondOp.getOpcode();
15085
15086 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15087 // a BICi in order to use an immediate instead of a register.
15088 // Is the other operand a shl or lshr? This will have been turned into:
15089 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15090 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15091 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15092 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15093 SecondOpc == AArch64ISD::SHL_PRED ||
15094 SecondOpc == AArch64ISD::SRL_PRED)) {
15095 And = FirstOp;
15096 Shift = SecondOp;
15097
15098 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15099 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15100 FirstOpc == AArch64ISD::SHL_PRED ||
15101 FirstOpc == AArch64ISD::SRL_PRED)) {
15102 And = SecondOp;
15103 Shift = FirstOp;
15104 } else
15105 return SDValue();
15106
15107 bool IsAnd = And.getOpcode() == ISD::AND;
15108 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15109 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15110 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15111 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15112
15113 // Is the shift amount constant and are all lanes active?
15114 uint64_t C2;
15115 if (ShiftHasPredOp) {
15116 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15117 return SDValue();
15118 APInt C;
15119 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15120 return SDValue();
15121 C2 = C.getZExtValue();
15122 } else if (ConstantSDNode *C2node =
15123 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15124 C2 = C2node->getZExtValue();
15125 else
15126 return SDValue();
15127
15128 APInt C1AsAPInt;
15129 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15130 if (IsAnd) {
15131 // Is the and mask vector all constant?
15132 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15133 return SDValue();
15134 } else {
15135 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15136 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15137 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15138 assert(C1nodeImm && C1nodeShift);
15139 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15140 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15141 }
15142
15143 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15144 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15145 // how much one can shift elements of a particular size?
15146 if (C2 > ElemSizeInBits)
15147 return SDValue();
15148
15149 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15150 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15151 if (C1AsAPInt != RequiredC1)
15152 return SDValue();
15153
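 // e.g. with 32-bit elements and C2 == 8 the SLI case requires C1 == 0xFF,
 // so (X & 0xFF) | (Y << 8) becomes SLI X, Y, #8.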
15154 SDValue X = And.getOperand(0);
15155 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15156 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15157 : Shift.getOperand(1);
15158
15159 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15160 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15161}
15162
15163static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15164 EVT VT = N->getValueType(0);
15165 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15166 SDLoc DL(N);
15167 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15168
15169 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15170 return SDValue();
15171
15172 SDValue N0 = N->getOperand(0);
15173 if (N0.getOpcode() != ISD::AND)
15174 return SDValue();
15175
15176 SDValue N1 = N->getOperand(1);
15177 if (N1.getOpcode() != ISD::AND)
15178 return SDValue();
15179
15180 // InstCombine does (not (neg a)) => (add a -1).
15181 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
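 // This works because sub(0, a) and add(a, -1) are bitwise complements:
 // ~(0 - a) == a - 1.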
15182 // Loop over all combinations of AND operands.
15183 for (int i = 1; i >= 0; --i) {
15184 for (int j = 1; j >= 0; --j) {
15185 SDValue O0 = N0->getOperand(i);
15186 SDValue O1 = N1->getOperand(j);
15187 SDValue Sub, Add, SubSibling, AddSibling;
15188
15189 // Find a SUB and an ADD operand, one from each AND.
15190 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15191 Sub = O0;
15192 Add = O1;
15193 SubSibling = N0->getOperand(1 - i);
15194 AddSibling = N1->getOperand(1 - j);
15195 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15196 Add = O0;
15197 Sub = O1;
15198 AddSibling = N0->getOperand(1 - i);
15199 SubSibling = N1->getOperand(1 - j);
15200 } else
15201 continue;
15202
15203 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15204 continue;
15205
15206 // The constant all-ones vector is always the right-hand operand of the Add.
15207 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15208 continue;
15209
15210 if (Sub.getOperand(1) != Add.getOperand(0))
15211 continue;
15212
15213 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15214 }
15215 }
15216
15217 // (or (and a b) (and (not a) c)) => (bsl a b c)
15218 // We only have to look for constant vectors here since the general, variable
15219 // case can be handled in TableGen.
15220 unsigned Bits = VT.getScalarSizeInBits();
15221 for (int i = 1; i >= 0; --i)
15222 for (int j = 1; j >= 0; --j) {
15223 APInt Val1, Val2;
15224
15225 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15226 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15227 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15228 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15229 N0->getOperand(1 - i), N1->getOperand(1 - j));
15230 }
15231 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15232 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15233 if (!BVN0 || !BVN1)
15234 continue;
15235
15236 bool FoundMatch = true;
15237 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15238 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15239 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15240 if (!CN0 || !CN1 ||
15241 CN0->getAPIntValue().trunc(Bits) !=
15242 ~CN1->getAsAPIntVal().trunc(Bits)) {
15243 FoundMatch = false;
15244 break;
15245 }
15246 }
15247 if (FoundMatch)
15248 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15249 N0->getOperand(1 - i), N1->getOperand(1 - j));
15250 }
15251
15252 return SDValue();
15253}
15254
15255SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15256 SelectionDAG &DAG) const {
15257 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15258 !Subtarget->isNeonAvailable()))
15259 return LowerToScalableOp(Op, DAG);
15260
15261 if (SDValue Res = tryLowerToBSL(Op, DAG))
15262 return Res;
15263
15264 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15265 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15266 return Res;
15267
15268 EVT VT = Op.getValueType();
15269 if (VT.isScalableVector())
15270 return Op;
15271
15272 SDValue LHS = Op.getOperand(0);
15273 BuildVectorSDNode *BVN =
15274 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15275 if (!BVN) {
15276 // OR commutes, so try swapping the operands.
15277 LHS = Op.getOperand(1);
15278 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15279 }
15280 if (!BVN)
15281 return Op;
15282
15283 APInt DefBits(VT.getSizeInBits(), 0);
15284 APInt UndefBits(VT.getSizeInBits(), 0);
15285 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15286 SDValue NewOp;
15287
15288 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15289 DefBits, &LHS)) ||
15290 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15291 DefBits, &LHS)))
15292 return NewOp;
15293
15294 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15295 UndefBits, &LHS)) ||
15296 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15297 UndefBits, &LHS)))
15298 return NewOp;
15299 }
15300
15301 // We can always fall back to a non-immediate OR.
15302 return Op;
15303}
15304
15305 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
15306 // be truncated to fit the element width.
15307static SDValue NormalizeBuildVector(SDValue Op,
15308 SelectionDAG &DAG) {
15309 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15310 SDLoc DL(Op);
15311 EVT VT = Op.getValueType();
15312 EVT EltTy = VT.getVectorElementType();
15313
15314 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15315 return Op;
15316
15317 SmallVector<SDValue, 16> Ops;
15318 for (SDValue Lane : Op->ops()) {
15319 // For integer vectors, type legalization would have promoted the
15320 // operands already. Otherwise, if Op is a floating-point splat
15321 // (with operands cast to integers), then the only possibilities
15322 // are constants and UNDEFs.
15323 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15324 Lane = DAG.getConstant(
15325 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15326 DL, MVT::i32);
15327 } else if (Lane.getNode()->isUndef()) {
15328 Lane = DAG.getUNDEF(MVT::i32);
15329 } else {
15330 assert(Lane.getValueType() == MVT::i32 &&
15331 "Unexpected BUILD_VECTOR operand type");
15332 }
15333 Ops.push_back(Lane);
15334 }
15335 return DAG.getBuildVector(VT, DL, Ops);
15336}
15337
15338static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15339 const AArch64Subtarget *ST, APInt &DefBits) {
15340 EVT VT = Op.getValueType();
15341 // TODO: We should be able to support 64-bit destinations too
15342 if (!ST->hasSVE() || !VT.is128BitVector() ||
15343 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15344 return SDValue();
15345
15346 // See if we can make use of the SVE dup instruction.
15347 APInt Val64 = DefBits.trunc(64);
15348 int32_t ImmVal, ShiftVal;
15349 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15350 return SDValue();
15351
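 // The SVE DUP/CPY immediate is a signed 8-bit value, optionally shifted left
 // by 8, so it can encode some 64-bit splats that the NEON MOVI forms cannot.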
15352 SDLoc DL(Op);
15353 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15354 DAG.getConstant(Val64, DL, MVT::i64));
15355 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15356 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15357}
15358
15359static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15360 const AArch64Subtarget *ST) {
15361 EVT VT = Op.getValueType();
15362 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15363 "Expected a legal NEON vector");
15364
15365 APInt DefBits(VT.getSizeInBits(), 0);
15366 APInt UndefBits(VT.getSizeInBits(), 0);
15367 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15368 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15369 auto TryMOVIWithBits = [&](APInt DefBits) {
15370 SDValue NewOp;
15371 if ((NewOp =
15372 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15373 (NewOp =
15374 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15375 (NewOp =
15376 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15377 (NewOp =
15378 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15379 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15380 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15381 return NewOp;
15382
15383 APInt NotDefBits = ~DefBits;
15384 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15385 NotDefBits)) ||
15386 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15387 NotDefBits)) ||
15388 (NewOp =
15389 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15390 return NewOp;
15391 return SDValue();
15392 };
15393 if (SDValue R = TryMOVIWithBits(DefBits))
15394 return R;
15395 if (SDValue R = TryMOVIWithBits(UndefBits))
15396 return R;
15397
15398 // Try to materialise the constant using SVE when available.
15399 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15400 return R;
15401
15402 // See if a fneg of the constant can be materialized with a MOVI, etc
15403 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15404 // FNegate each sub-element of the constant
15405 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15406 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15407 .zext(VT.getSizeInBits());
15408 APInt NegBits(VT.getSizeInBits(), 0);
15409 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15410 for (unsigned i = 0; i < NumElts; i++)
15411 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15412 NegBits = DefBits ^ NegBits;
15413
15414 // Try to create the new constants with MOVI, and if so generate a fneg
15415 // for it.
15416 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15417 SDLoc DL(Op);
15418 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15419 return DAG.getNode(
15420 AArch64ISD::NVCAST, DL, VT,
15421 DAG.getNode(ISD::FNEG, DL, VFVT,
15422 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15423 }
15424 return SDValue();
15425 };
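 // e.g. a v4i32 splat of 0x80000001 has no MOVI/MVNI/FMOV encoding, but
 // flipping the per-element sign bit leaves 0x00000001, which MOVI can build;
 // an FNEG then restores the original constant.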
15426 SDValue R;
15427 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15428 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15429 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15430 return R;
15431 }
15432
15433 return SDValue();
15434}
15435
15436SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15437 SDValue Op, SelectionDAG &DAG) const {
15438 EVT VT = Op.getValueType();
15439 SDLoc DL(Op);
15440 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15441 auto *BVN = cast<BuildVectorSDNode>(Op);
15442
15443 if (auto SeqInfo = BVN->isConstantSequence()) {
15444 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15445 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15446 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15447 return convertFromScalableVector(DAG, VT, Seq);
15448 }
15449
15450 unsigned NumElems = VT.getVectorNumElements();
15451 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15452 NumElems <= 1 || BVN->isConstant())
15453 return SDValue();
15454
15455 auto IsExtractElt = [](SDValue Op) {
15456 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15457 };
15458
15459 // For integer types that are not already in vectors, limit to at most four
15460 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15461 if (VT.getScalarType().isInteger() &&
15462 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15463 return SDValue();
15464
15465 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15466 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15467 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15468 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15469 return Op.isUndef() ? Undef
15470 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15471 ContainerVT, Undef, Op, ZeroI64);
15472 });
15473
15474 ElementCount ZipEC = ContainerVT.getVectorElementCount();
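 // ZIP1 pairs of intermediates, doubling the interleave granularity each
 // round; after log2(NumElems) rounds all elements are packed contiguously
 // in order.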
15475 while (Intermediates.size() > 1) {
15476 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15477
15478 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15479 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15480 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15481 Intermediates[I / 2] =
15482 Op1.isUndef() ? Op0
15483 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15484 }
15485
15486 Intermediates.resize(Intermediates.size() / 2);
15487 ZipEC = ZipEC.divideCoefficientBy(2);
15488 }
15489
15490 assert(Intermediates.size() == 1);
15491 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15492 return convertFromScalableVector(DAG, VT, Vec);
15493}
15494
15495SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15496 SelectionDAG &DAG) const {
15497 EVT VT = Op.getValueType();
15498
15499 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15500 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15501 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15502 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15503
15504 // Try to build a simple constant vector.
15505 Op = NormalizeBuildVector(Op, DAG);
15506 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15507 // abort.
15508 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15509 return SDValue();
15510
15511 // Certain vector constants, used to express things like logical NOT and
15512 // arithmetic NEG, are passed through unmodified. This allows special
15513 // patterns for these operations to match, which will lower these constants
15514 // to whatever is proven necessary.
15515 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15516 if (BVN->isConstant()) {
15517 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15518 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15519 APInt Val(BitSize,
15520 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15521 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15522 return Op;
15523 }
15524 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15525 if (Const->isZero() && !Const->isNegative())
15526 return Op;
15527 }
15528
15529 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15530 return V;
15531
15532 // Scan through the operands to find some interesting properties we can
15533 // exploit:
15534 // 1) If only one value is used, we can use a DUP, or
15535 // 2) if only the low element is not undef, we can just insert that, or
15536 // 3) if only one constant value is used (w/ some non-constant lanes),
15537 // we can splat the constant value into the whole vector then fill
15538 // in the non-constant lanes.
15539 // 4) FIXME: If different constant values are used, but we can intelligently
15540 // select the values we'll be overwriting for the non-constant
15541 // lanes such that we can directly materialize the vector
15542 // some other way (MOVI, e.g.), we can be sneaky.
15543 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15544 SDLoc DL(Op);
15545 unsigned NumElts = VT.getVectorNumElements();
15546 bool isOnlyLowElement = true;
15547 bool usesOnlyOneValue = true;
15548 bool usesOnlyOneConstantValue = true;
15549 bool isConstant = true;
15550 bool AllLanesExtractElt = true;
15551 unsigned NumConstantLanes = 0;
15552 unsigned NumDifferentLanes = 0;
15553 unsigned NumUndefLanes = 0;
15554 SDValue Value;
15555 SDValue ConstantValue;
15556 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15557 unsigned ConsecutiveValCount = 0;
15558 SDValue PrevVal;
15559 for (unsigned i = 0; i < NumElts; ++i) {
15560 SDValue V = Op.getOperand(i);
15561 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15562 AllLanesExtractElt = false;
15563 if (V.isUndef()) {
15564 ++NumUndefLanes;
15565 continue;
15566 }
15567 if (i > 0)
15568 isOnlyLowElement = false;
15569 if (!isIntOrFPConstant(V))
15570 isConstant = false;
15571
15572 if (isIntOrFPConstant(V)) {
15573 ++NumConstantLanes;
15574 if (!ConstantValue.getNode())
15575 ConstantValue = V;
15576 else if (ConstantValue != V)
15577 usesOnlyOneConstantValue = false;
15578 }
15579
15580 if (!Value.getNode())
15581 Value = V;
15582 else if (V != Value) {
15583 usesOnlyOneValue = false;
15584 ++NumDifferentLanes;
15585 }
15586
15587 if (PrevVal != V) {
15588 ConsecutiveValCount = 0;
15589 PrevVal = V;
15590 }
15591
15592 // Keep each different value and its last consecutive count. For example,
15593 //
15594 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15595 // t24, t24, t24, t24, t24, t24, t24, t24
15596 // t23 = consecutive count 8
15597 // t24 = consecutive count 8
15598 // ------------------------------------------------------------------
15599 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15600 // t24, t24, t24, t24, t24, t24, t24, t24
15601 // t23 = consecutive count 5
15602 // t24 = consecutive count 9
15603 DifferentValueMap[V] = ++ConsecutiveValCount;
15604 }
15605
15606 if (!Value.getNode()) {
15607 LLVM_DEBUG(
15608 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15609 return DAG.getUNDEF(VT);
15610 }
15611
15612 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15613 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15614 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15615 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15616 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15617 "SCALAR_TO_VECTOR node\n");
15618 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15619 }
15620
15621 if (AllLanesExtractElt) {
15622 SDNode *Vector = nullptr;
15623 bool Even = false;
15624 bool Odd = false;
15625 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15626 // the Odd pattern <1,3,5,...>.
15627 for (unsigned i = 0; i < NumElts; ++i) {
15628 SDValue V = Op.getOperand(i);
15629 const SDNode *N = V.getNode();
15630 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15631 Even = false;
15632 Odd = false;
15633 break;
15634 }
15635 SDValue N0 = N->getOperand(0);
15636
15637 // All elements are extracted from the same vector.
15638 if (!Vector) {
15639 Vector = N0.getNode();
15640 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15641 // BUILD_VECTOR.
15642 if (VT.getVectorElementType() !=
15643 N0.getValueType().getVectorElementType())
15644 break;
15645 } else if (Vector != N0.getNode()) {
15646 Odd = false;
15647 Even = false;
15648 break;
15649 }
15650
15651 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15652 // indices <1,3,5,...>.
15653 uint64_t Val = N->getConstantOperandVal(1);
15654 if (Val == 2 * i) {
15655 Even = true;
15656 continue;
15657 }
15658 if (Val - 1 == 2 * i) {
15659 Odd = true;
15660 continue;
15661 }
15662
15663 // Something does not match: abort.
15664 Odd = false;
15665 Even = false;
15666 break;
15667 }
15668 if (Even || Odd) {
15669 SDValue LHS =
15670 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15671 DAG.getConstant(0, DL, MVT::i64));
15672 SDValue RHS =
15673 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15674 DAG.getConstant(NumElts, DL, MVT::i64));
15675
15676 if (Even && !Odd)
15677 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15678 if (Odd && !Even)
15679 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15680 }
15681 }
15682
15683 // Use DUP for non-constant splats. For floating-point constant splats,
15684 // bitcast to the equivalent integer type and try again.
15685 if (usesOnlyOneValue) {
15686 if (!isConstant) {
15687 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15688 Value.getValueType() != VT) {
15689 LLVM_DEBUG(
15690 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15691 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15692 }
15693
15694 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
15695
15696 SDValue Lane = Value.getOperand(1);
15697 Value = Value.getOperand(0);
15698 if (Value.getValueSizeInBits() == 64) {
15699 LLVM_DEBUG(
15700 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15701 "widening it\n");
15702 Value = WidenVector(Value, DAG);
15703 }
15704
15705 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15706 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15707 }
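// Illustrative sketch (added commentary, not part of the upstream source):
// splatting a scalar register value uses DUP, while splatting a lane of an
// existing vector can roughly stay in the vector registers via DUPLANE, e.g.
//   v4i32 build_vector t0, t0, t0, t0          --> AArch64ISD::DUP t0
//   v4i32 splat of (extract_vector_elt t1, 2)  --> AArch64ISD::DUPLANE32 t1, 2
// (a 64-bit source vector t1 is widened to 128 bits first, as above).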
15708
15709 if (VT.getVectorElementType().isFloatingPoint()) {
15710 SmallVector<SDValue, 8> Ops;
15711 EVT EltTy = VT.getVectorElementType();
15712 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15713 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15714 LLVM_DEBUG(
15715 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15716 "BITCASTS, and try again\n");
15717 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15718 for (unsigned i = 0; i < NumElts; ++i)
15719 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15720 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15721 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15722 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15723 Val.dump(););
15724 Val = LowerBUILD_VECTOR(Val, DAG);
15725 if (Val.getNode())
15726 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15727 }
15728 }
15729
15730 // If we need to insert a small number of different non-constant elements and
15731 // the vector width is sufficiently large, prefer using DUP with the common
15732 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15733 // skip the constant lane handling below.
15734 bool PreferDUPAndInsert =
15735 !isConstant && NumDifferentLanes >= 1 &&
15736 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15737 NumDifferentLanes >= NumConstantLanes;
15738
15739 // If only one constant value was used, and it was used for more than one
15740 // lane, start by splatting that value, then replace the non-constant lanes. This
15741 // is better than the default, which will perform a separate initialization
15742 // for each lane.
15743 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15744 // Firstly, try to materialize the splat constant.
15745 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15746 unsigned BitSize = VT.getScalarSizeInBits();
15747 APInt ConstantValueAPInt(1, 0);
15748 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15749 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15750 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15751 !ConstantValueAPInt.isAllOnes()) {
15752 Val = ConstantBuildVector(Val, DAG, Subtarget);
15753 if (!Val)
15754 // Otherwise, materialize the constant and splat it.
15755 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15756 }
15757
15758 // Now insert the non-constant lanes.
15759 for (unsigned i = 0; i < NumElts; ++i) {
15760 SDValue V = Op.getOperand(i);
15761 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15762 if (!isIntOrFPConstant(V) && !V.isUndef())
15763 // Note that type legalization likely mucked about with the VT of the
15764 // source operand, so we may have to convert it here before inserting.
15765 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15766 }
15767 return Val;
15768 }
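// Illustrative sketch (added commentary, not part of the upstream source): for
//   v4i32 build_vector t0, 7, 7, 7
// the constant 7 is splatted first (e.g. with a single MOVI) and only lane 0
// is then patched with INSERT_VECTOR_ELT, instead of initialising all four
// lanes one by one.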
15769
15770 // This will generate a load from the constant pool.
15771 if (isConstant) {
15772 LLVM_DEBUG(
15773 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15774 "expansion\n");
15775 return SDValue();
15776 }
15777
15778 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15779 // v4i32s. This is really a truncate, which we can construct out of (legal)
15780 // concats and truncate nodes.
15781 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15782 return M;
15783
15784 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15785 if (NumElts >= 4) {
15786 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15787 return Shuffle;
15788
15789 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15790 return Shuffle;
15791 }
15792
15793 if (PreferDUPAndInsert) {
15794 // First, build a constant vector with the common element.
15795 SmallVector<SDValue, 8> Ops(NumElts, Value);
15796 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15797 // Next, insert the elements that do not match the common value.
15798 for (unsigned I = 0; I < NumElts; ++I)
15799 if (Op.getOperand(I) != Value)
15800 NewVector =
15801 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15802 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15803
15804 return NewVector;
15805 }
15806
15807 // If vector consists of two different values, try to generate two DUPs and
15808 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15809 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15810 SmallVector<SDValue, 2> Vals;
15811 // Check whether the consecutive count of each value is half the number of
15812 // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15813 //
15814 // canUseVECTOR_CONCAT = true;
15815 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15816 // t24, t24, t24, t24, t24, t24, t24, t24
15817 //
15818 // canUseVECTOR_CONCAT = false;
15819 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15820 // t24, t24, t24, t24, t24, t24, t24, t24
15821 bool canUseVECTOR_CONCAT = true;
15822 for (auto Pair : DifferentValueMap) {
15823 // Check that each distinct value has a consecutive count of NumElts / 2.
15824 if (Pair.second != NumElts / 2)
15825 canUseVECTOR_CONCAT = false;
15826 Vals.push_back(Pair.first);
15827 }
15828
15829 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15830 // CONCAT_VECTORs. For example,
15831 //
15832 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15833 // t24, t24, t24, t24, t24, t24, t24, t24
15834 // ==>
15835 // t26: v8i8 = AArch64ISD::DUP t23
15836 // t28: v8i8 = AArch64ISD::DUP t24
15837 // t29: v16i8 = concat_vectors t26, t28
15838 if (canUseVECTOR_CONCAT) {
15839 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15840 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15841 SubVT.getVectorNumElements() >= 2) {
15842 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15843 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15844 SDValue DUP1 =
15845 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15846 SDValue DUP2 =
15847 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15848 SDValue CONCAT_VECTORS =
15849 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15850 return CONCAT_VECTORS;
15851 }
15852 }
15853
15854 // Let's try to generate VECTOR_SHUFFLE. For example,
15855 //
15856 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15857 // ==>
15858 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15859 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15860 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15861 if (NumElts >= 8) {
15862 SmallVector<int, 16> MaskVec;
15863 // Build mask for VECTOR_SHUFFLE.
15864 SDValue FirstLaneVal = Op.getOperand(0);
15865 for (unsigned i = 0; i < NumElts; ++i) {
15866 SDValue Val = Op.getOperand(i);
15867 if (FirstLaneVal == Val)
15868 MaskVec.push_back(i);
15869 else
15870 MaskVec.push_back(i + NumElts);
15871 }
15872
15873 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15874 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15875 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15876 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15877 SDValue VECTOR_SHUFFLE =
15878 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15879 return VECTOR_SHUFFLE;
15880 }
15881 }
15882
15883 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15884 // know the default expansion would otherwise fall back on something even
15885 // worse. For a vector with one or two non-undef values, that's
15886 // scalar_to_vector for the elements followed by a shuffle (provided the
15887 // shuffle is valid for the target) and materialization element by element
15888 // on the stack followed by a load for everything else.
15889 if (!isConstant && !usesOnlyOneValue) {
15890 LLVM_DEBUG(
15891 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15892 "of INSERT_VECTOR_ELT\n");
15893
15894 SDValue Vec = DAG.getUNDEF(VT);
15895 SDValue Op0 = Op.getOperand(0);
15896 unsigned i = 0;
15897
15898 // Use SCALAR_TO_VECTOR for lane zero to
15899 // a) Avoid a RMW dependency on the full vector register, and
15900 // b) Allow the register coalescer to fold away the copy if the
15901 // value is already in an S or D register, and we're forced to emit an
15902 // INSERT_SUBREG that we can't fold anywhere.
15903 //
15904 // We also allow types like i8 and i16 which are illegal scalar but legal
15905 // vector element types. After type-legalization the inserted value is
15906 // extended (i32) and it is safe to cast them to the vector type by ignoring
15907 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15908 if (!Op0.isUndef()) {
15909 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15910 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15911 ++i;
15912 }
15913 LLVM_DEBUG({
15914 if (i < NumElts)
15915 dbgs() << "Creating nodes for the other vector elements:\n";
15916 });
15917 for (; i < NumElts; ++i) {
15918 SDValue V = Op.getOperand(i);
15919 if (V.isUndef())
15920 continue;
15921 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15922 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15923 }
15924 return Vec;
15925 }
15926
15927 LLVM_DEBUG(
15928 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15929 "better alternative\n");
15930 return SDValue();
15931}
15932
15933SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15934 SelectionDAG &DAG) const {
15935 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15936 !Subtarget->isNeonAvailable()))
15937 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15938
15939 assert(Op.getValueType().isScalableVector() &&
15940 isTypeLegal(Op.getValueType()) &&
15941 "Expected legal scalable vector type!");
15942
15943 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15944 unsigned NumOperands = Op->getNumOperands();
15945 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15946 "Unexpected number of operands in CONCAT_VECTORS");
15947
15948 if (NumOperands == 2)
15949 return Op;
15950
15951 // Concat each pair of subvectors and pack into the lower half of the array.
15952 SmallVector<SDValue> ConcatOps(Op->ops());
15953 while (ConcatOps.size() > 1) {
15954 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15955 SDValue V1 = ConcatOps[I];
15956 SDValue V2 = ConcatOps[I + 1];
15957 EVT SubVT = V1.getValueType();
15958 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15959 ConcatOps[I / 2] =
15960 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15961 }
15962 ConcatOps.resize(ConcatOps.size() / 2);
15963 }
15964 return ConcatOps[0];
15965 }
15966
15967 return SDValue();
15968}
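// Illustrative sketch (added commentary, not part of the upstream source):
// a four-operand scalable concat such as
//   nxv8i16 concat_vectors a, b, c, d     (each operand nxv2i16)
// is reduced pairwise by the loop above:
//   ab = nxv4i16 concat_vectors a, b
//   cd = nxv4i16 concat_vectors c, d
//   result = nxv8i16 concat_vectors ab, cd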
15969
15970SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15971 SelectionDAG &DAG) const {
15972 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15973
15974 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15975 !Subtarget->isNeonAvailable()))
15976 return LowerFixedLengthInsertVectorElt(Op, DAG);
15977
15978 EVT VT = Op.getOperand(0).getValueType();
15979
15980 if (VT.getScalarType() == MVT::i1) {
15981 EVT VectorVT = getPromotedVTForPredicate(VT);
15982 SDLoc DL(Op);
15983 SDValue ExtendedVector =
15984 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15985 SDValue ExtendedValue =
15986 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15987 VectorVT.getScalarType().getSizeInBits() < 32
15988 ? MVT::i32
15989 : VectorVT.getScalarType());
15990 ExtendedVector =
15991 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15992 ExtendedValue, Op.getOperand(2));
15993 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15994 }
15995
15996 // Check for non-constant or out of range lane.
15997 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15998 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15999 return SDValue();
16000
16001 return Op;
16002}
16003
16004SDValue
16005AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16006 SelectionDAG &DAG) const {
16007 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16008 EVT VT = Op.getOperand(0).getValueType();
16009
16010 if (VT.getScalarType() == MVT::i1) {
16011 // We can't directly extract from an SVE predicate; extend it first.
16012 // (This isn't the only possible lowering, but it's straightforward.)
16013 EVT VectorVT = getPromotedVTForPredicate(VT);
16014 SDLoc DL(Op);
16015 SDValue Extend =
16016 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16017 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16018 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16019 Extend, Op.getOperand(1));
16020 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16021 }
16022
16023 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16024 return LowerFixedLengthExtractVectorElt(Op, DAG);
16025
16026 // Check for non-constant or out of range lane.
16027 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16028 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16029 return SDValue();
16030
16031 // Insertion/extraction are legal for V128 types.
16032 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16033 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16034 VT == MVT::v8f16 || VT == MVT::v8bf16)
16035 return Op;
16036
16037 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16038 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16039 VT != MVT::v4bf16)
16040 return SDValue();
16041
16042 // For V64 types, we perform extraction by expanding the value
16043 // to a V128 type and perform the extraction on that.
16044 SDLoc DL(Op);
16045 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16046 EVT WideTy = WideVec.getValueType();
16047
16048 EVT ExtrTy = WideTy.getVectorElementType();
16049 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16050 ExtrTy = MVT::i32;
16051
16052 // For extractions, we just return the result directly.
16053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16054 Op.getOperand(1));
16055}
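// Illustrative sketch (added commentary, not part of the upstream source):
// extracting a lane from a 64-bit vector goes through its 128-bit counterpart,
// e.g.
//   i32 extract_vector_elt (v4i16 t0), 3
//     --> i32 extract_vector_elt (v8i16 insert_subvector undef, t0, 0), 3
// which can then be selected with the usual patterns for 128-bit vectors.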
16056
16057SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16058 SelectionDAG &DAG) const {
16059 EVT VT = Op.getValueType();
16061 "Only cases that extract a fixed length vector are supported!");
16062 EVT InVT = Op.getOperand(0).getValueType();
16063
16064 // If we don't have legal types yet, do nothing
16065 if (!isTypeLegal(InVT))
16066 return SDValue();
16067
16068 if (InVT.is128BitVector()) {
16069 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16070 unsigned Idx = Op.getConstantOperandVal(1);
16071
16072 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16073 if (Idx == 0)
16074 return Op;
16075
16076 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16077 // that directly.
16078 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16079 return Op;
16080 }
16081
16082 if (InVT.isScalableVector() ||
16083 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16084 SDLoc DL(Op);
16085 SDValue Vec = Op.getOperand(0);
16086 SDValue Idx = Op.getOperand(1);
16087
16088 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16089 if (PackedVT != InVT) {
16090 // Pack input into the bottom part of an SVE register and try again.
16091 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16092 DAG.getUNDEF(PackedVT), Vec,
16093 DAG.getVectorIdxConstant(0, DL));
16094 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16095 }
16096
16097 // This will get matched by custom code during ISelDAGToDAG.
16098 if (isNullConstant(Idx))
16099 return Op;
16100
16101 assert(InVT.isScalableVector() && "Unexpected vector type!");
16102 // Move requested subvector to the start of the vector and try again.
16103 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16104 return convertFromScalableVector(DAG, VT, Splice);
16105 }
16106
16107 return SDValue();
16108}
16109
16110SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16111 SelectionDAG &DAG) const {
16112 assert(Op.getValueType().isScalableVector() &&
16113 "Only expect to lower inserts into scalable vectors!");
16114
16115 EVT InVT = Op.getOperand(1).getValueType();
16116 unsigned Idx = Op.getConstantOperandVal(2);
16117
16118 SDValue Vec0 = Op.getOperand(0);
16119 SDValue Vec1 = Op.getOperand(1);
16120 SDLoc DL(Op);
16121 EVT VT = Op.getValueType();
16122
16123 if (InVT.isScalableVector()) {
16124 if (!isTypeLegal(VT))
16125 return SDValue();
16126
16127 // Break down insert_subvector into simpler parts.
16128 if (VT.getVectorElementType() == MVT::i1) {
16129 unsigned NumElts = VT.getVectorMinNumElements();
16130 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16131
16132 SDValue Lo, Hi;
16133 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16134 DAG.getVectorIdxConstant(0, DL));
16135 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16136 DAG.getVectorIdxConstant(NumElts / 2, DL));
16137 if (Idx < (NumElts / 2))
16138 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16139 DAG.getVectorIdxConstant(Idx, DL));
16140 else
16141 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16142 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16143
16144 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16145 }
16146
16147 // We can select these directly.
16148 if (isTypeLegal(InVT) && Vec0.isUndef())
16149 return Op;
16150
16151 // Ensure the subvector is half the size of the main vector.
16152 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16153 return SDValue();
16154
16155 // Here narrow and wide refer to the vector element types. After "casting",
16156 // both vectors must have the same bit length, so because the subvector
16157 // has fewer elements, those elements need to be bigger.
16158 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16159 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16160
16161 // NOP cast operands to the largest legal vector of the same element count.
16162 if (VT.isFloatingPoint()) {
16163 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16164 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16165 } else {
16166 // Legal integer vectors are already their largest so Vec0 is fine as is.
16167 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16168 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16169 }
16170
16171 // To replace the top/bottom half of vector V with vector SubV we widen the
16172 // preserved half of V, concatenate this to SubV (the order depending on the
16173 // half being replaced) and then narrow the result.
16174 SDValue Narrow;
16175 if (Idx == 0) {
16176 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16177 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16178 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16179 } else {
16180 assert(Idx == InVT.getVectorMinNumElements() &&
16181 "Invalid subvector index!");
16182 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16183 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16184 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16185 }
16186
16187 return getSVESafeBitCast(VT, Narrow, DAG);
16188 }
16189
16190 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16191 // This will be matched by custom code during ISelDAGToDAG.
16192 if (Vec0.isUndef())
16193 return Op;
16194
16195 std::optional<unsigned> PredPattern =
16196 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16197 auto PredTy = VT.changeVectorElementType(MVT::i1);
16198 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16199 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16200 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16201 }
16202
16203 return SDValue();
16204}
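// Illustrative sketch (added commentary, not part of the upstream source):
// replacing the low half of an nxv4i32 V with an nxv2i32 SubV roughly becomes
//   Hi    = nxv2i64 UUNPKHI V                     ; widen the preserved half
//   Hi'   = nxv4i32 NVCAST Hi
//   SubV' = nxv4i32 NVCAST (nxv2i64 any_extend SubV)
//   Res   = nxv4i32 UZP1 SubV', Hi'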
16205
16206static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16207 if (Op.getOpcode() != AArch64ISD::DUP &&
16208 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16209 Op.getOpcode() != ISD::BUILD_VECTOR)
16210 return false;
16211
16212 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16213 !isAllConstantBuildVector(Op, SplatVal))
16214 return false;
16215
16216 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16217 !isa<ConstantSDNode>(Op->getOperand(0)))
16218 return false;
16219
16220 SplatVal = Op->getConstantOperandVal(0);
16221 if (Op.getValueType().getVectorElementType() != MVT::i64)
16222 SplatVal = (int32_t)SplatVal;
16223
16224 Negated = false;
16225 if (isPowerOf2_64(SplatVal))
16226 return true;
16227
16228 Negated = true;
16229 if (isPowerOf2_64(-SplatVal)) {
16230 SplatVal = -SplatVal;
16231 return true;
16232 }
16233
16234 return false;
16235}
16236
16237SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16238 EVT VT = Op.getValueType();
16239 SDLoc DL(Op);
16240
16241 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16242 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16243
16244 assert(VT.isScalableVector() && "Expected a scalable vector.");
16245
16246 bool Signed = Op.getOpcode() == ISD::SDIV;
16247 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16248
16249 bool Negated;
16250 uint64_t SplatVal;
16251 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
16252 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16253 SDValue Res =
16254 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16255 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16256 if (Negated)
16257 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16258
16259 return Res;
16260 }
16261
16262 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16263 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16264
16265 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16266 // operations, and truncate the result.
16267 EVT WidenedVT;
16268 if (VT == MVT::nxv16i8)
16269 WidenedVT = MVT::nxv8i16;
16270 else if (VT == MVT::nxv8i16)
16271 WidenedVT = MVT::nxv4i32;
16272 else
16273 llvm_unreachable("Unexpected Custom DIV operation");
16274
16275 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16276 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16277 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16278 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16279 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16280 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16281 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16282 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16283 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16284 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16285 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16286}
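// Illustrative sketch (added commentary, not part of the upstream source):
// an nxv8i16 sdiv A, B is widened because SVE only has 32/64-bit divides:
//   LoA = nxv4i32 SUNPKLO A       HiA = nxv4i32 SUNPKHI A
//   LoB = nxv4i32 SUNPKLO B       HiB = nxv4i32 SUNPKHI B
//   Lo  = nxv4i32 sdiv LoA, LoB   Hi  = nxv4i32 sdiv HiA, HiB
//   Res = nxv8i16 UZP1 (NVCAST Lo), (NVCAST Hi)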
16287
16288bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16289 EVT VT, unsigned DefinedValues) const {
16290 if (!Subtarget->isNeonAvailable())
16291 return false;
16292 return DefinedValues < 3;
16293}
16294
16295 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16296 // Currently no fixed length shuffles that require SVE are legal.
16297 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16298 return false;
16299
16300 if (VT.getVectorNumElements() == 4 &&
16301 (VT.is128BitVector() || VT.is64BitVector())) {
16302 unsigned Cost = getPerfectShuffleCost(M);
16303 if (Cost <= 1)
16304 return true;
16305 }
16306
16307 bool DummyBool;
16308 int DummyInt;
16309 unsigned DummyUnsigned;
16310
16311 unsigned EltSize = VT.getScalarSizeInBits();
16312 unsigned NumElts = VT.getVectorNumElements();
16313 return (ShuffleVectorSDNode::isSplatMask(M) ||
16314 isREVMask(M, EltSize, NumElts, 64) ||
16315 isREVMask(M, EltSize, NumElts, 32) ||
16316 isREVMask(M, EltSize, NumElts, 16) ||
16317 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16318 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16319 isTRNMask(M, NumElts, DummyUnsigned) ||
16320 isUZPMask(M, NumElts, DummyUnsigned) ||
16321 isZIPMask(M, NumElts, DummyUnsigned) ||
16322 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16323 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16324 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16325 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16326 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16327}
16328
16329 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16330 EVT VT) const {
16331 // Just delegate to the generic legality, clear masks aren't special.
16332 return isShuffleMaskLegal(M, VT);
16333}
16334
16335/// getVShiftImm - Check if this is a valid build_vector for the immediate
16336/// operand of a vector shift operation, where all the elements of the
16337/// build_vector must have the same constant integer value.
16338static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16339 // Ignore bit_converts.
16340 while (Op.getOpcode() == ISD::BITCAST)
16341 Op = Op.getOperand(0);
16342 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16343 APInt SplatBits, SplatUndef;
16344 unsigned SplatBitSize;
16345 bool HasAnyUndefs;
16346 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16347 HasAnyUndefs, ElementBits) ||
16348 SplatBitSize > ElementBits)
16349 return false;
16350 Cnt = SplatBits.getSExtValue();
16351 return true;
16352}
16353
16354/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16355/// operand of a vector shift left operation. That value must be in the range:
16356/// 0 <= Value < ElementBits for a left shift; or
16357/// 0 <= Value <= ElementBits for a long left shift.
16358static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16359 assert(VT.isVector() && "vector shift count is not a vector type");
16360 int64_t ElementBits = VT.getScalarSizeInBits();
16361 if (!getVShiftImm(Op, ElementBits, Cnt))
16362 return false;
16363 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16364}
16365
16366/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16367/// operand of a vector shift right operation. The value must be in the range:
16368/// 1 <= Value <= ElementBits for a right shift; or
16369static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16370 assert(VT.isVector() && "vector shift count is not a vector type");
16371 int64_t ElementBits = VT.getScalarSizeInBits();
16372 if (!getVShiftImm(Op, ElementBits, Cnt))
16373 return false;
16374 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16375}
16376
16377SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16378 SelectionDAG &DAG) const {
16379 EVT VT = Op.getValueType();
16380
16381 if (VT.getScalarType() == MVT::i1) {
16382 // Lower i1 truncate to `(x & 1) != 0`.
16383 SDLoc DL(Op);
16384 EVT OpVT = Op.getOperand(0).getValueType();
16385 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16386 SDValue One = DAG.getConstant(1, DL, OpVT);
16387 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16388 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16389 }
16390
16391 if (!VT.isVector() || VT.isScalableVector())
16392 return SDValue();
16393
16394 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16395 !Subtarget->isNeonAvailable()))
16396 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16397
16398 return SDValue();
16399}
16400
16401 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16402// possibly a truncated type, it tells how many bits of the value are to be
16403// used.
16404 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16405 SelectionDAG &DAG,
16406 unsigned &ShiftValue,
16407 SDValue &RShOperand) {
16408 if (Shift->getOpcode() != ISD::SRL)
16409 return false;
16410
16411 EVT VT = Shift.getValueType();
16412 assert(VT.isScalableVT());
16413
16414 auto ShiftOp1 =
16415 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16416 if (!ShiftOp1)
16417 return false;
16418
16419 ShiftValue = ShiftOp1->getZExtValue();
16420 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16421 return false;
16422
16423 SDValue Add = Shift->getOperand(0);
16424 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16425 return false;
16426
16428 "ResVT must be truncated or same type as the shift.");
16429 // Check if an overflow can lead to incorrect results.
16430 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16431 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16432 return false;
16433
16434 auto AddOp1 =
16435 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16436 if (!AddOp1)
16437 return false;
16438 uint64_t AddValue = AddOp1->getZExtValue();
16439 if (AddValue != 1ULL << (ShiftValue - 1))
16440 return false;
16441
16442 RShOperand = Add->getOperand(0);
16443 return true;
16444}
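// Illustrative sketch (added commentary, not part of the upstream source):
// with ShiftValue == 4 the pattern recognised above is
//   srl (add X, 8), 4
// i.e. X is biased by half the rounding interval before the shift, which is
// exactly what a rounding shift right (URSHR) by 4 computes, so the add/srl
// pair collapses into one predicated rounding shift of X.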
16445
16446SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16447 SelectionDAG &DAG) const {
16448 EVT VT = Op.getValueType();
16449 SDLoc DL(Op);
16450 int64_t Cnt;
16451
16452 if (!Op.getOperand(1).getValueType().isVector())
16453 return Op;
16454 unsigned EltSize = VT.getScalarSizeInBits();
16455
16456 switch (Op.getOpcode()) {
16457 case ISD::SHL:
16458 if (VT.isScalableVector() ||
16459 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16460 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16461
16462 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16463 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16464 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16465 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16466 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16467 MVT::i32),
16468 Op.getOperand(0), Op.getOperand(1));
16469 case ISD::SRA:
16470 case ISD::SRL:
16471 if (VT.isScalableVector() &&
16472 (Subtarget->hasSVE2() ||
16473 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16474 SDValue RShOperand;
16475 unsigned ShiftValue;
16476 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16477 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16478 getPredicateForVector(DAG, DL, VT), RShOperand,
16479 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16480 }
16481
16482 if (VT.isScalableVector() ||
16483 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16484 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16485 : AArch64ISD::SRL_PRED;
16486 return LowerToPredicatedOp(Op, DAG, Opc);
16487 }
16488
16489 // Right shift immediate
16490 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16491 unsigned Opc =
16492 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16493 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16494 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16495 Op->getFlags());
16496 }
16497
16498 // Right shift register. Note, there is not a shift right register
16499 // instruction, but the shift left register instruction takes a signed
16500 // value, where negative numbers specify a right shift.
16501 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16502 : Intrinsic::aarch64_neon_ushl;
16503 // negate the shift amount
16504 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16505 Op.getOperand(1));
16506 SDValue NegShiftLeft =
16507 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16508 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16509 NegShift);
16510 return NegShiftLeft;
16511 }
16512
16513 llvm_unreachable("unexpected shift opcode");
16514}
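// Illustrative sketch (added commentary, not part of the upstream source):
// a variable right shift such as
//   v4i32 srl X, Amt
// is emitted as an unsigned shift-left by the negated amount,
//   neg = sub (splat 0), Amt
//   int_aarch64_neon_ushl X, neg
// because USHL/SSHL treat negative per-lane shift amounts as right shifts.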
16515
16516SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16517 SelectionDAG &DAG) const {
16518 if (Op.getValueType().isScalableVector())
16519 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16520
16521 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16522 !Subtarget->isNeonAvailable()))
16523 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16524
16525 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16526 SDValue LHS = Op.getOperand(0);
16527 SDValue RHS = Op.getOperand(1);
16528 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16529 SDLoc DL(Op);
16530
16531 if (LHS.getValueType().getVectorElementType().isInteger())
16532 return Op;
16533
16534 assert(((!Subtarget->hasFullFP16() &&
16535 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16536 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16537 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16538 "Unexpected type!");
16539
16540 // Lower isnan(x) | isnan(never-nan) to x != x.
16541 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16542 if (CC == ISD::SETUO || CC == ISD::SETO) {
16543 bool OneNaN = false;
16544 if (LHS == RHS) {
16545 OneNaN = true;
16546 } else if (DAG.isKnownNeverNaN(RHS)) {
16547 OneNaN = true;
16548 RHS = LHS;
16549 } else if (DAG.isKnownNeverNaN(LHS)) {
16550 OneNaN = true;
16551 LHS = RHS;
16552 }
16553 if (OneNaN) {
16554 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16555 }
16556 }
16557
16558 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16559 // clean. Some of them require two branches to implement.
16560 AArch64CC::CondCode CC1, CC2;
16561 bool ShouldInvert;
16562 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16563
16564 bool NoNaNs =
16565 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16566 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16567 if (!Cmp.getNode())
16568 return SDValue();
16569
16570 if (CC2 != AArch64CC::AL) {
16571 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16572 if (!Cmp2.getNode())
16573 return SDValue();
16574
16575 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16576 }
16577
16578 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16579
16580 if (ShouldInvert)
16581 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16582
16583 return Cmp;
16584}
16585
16586static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16587 SelectionDAG &DAG) {
16588 SDValue VecOp = ScalarOp.getOperand(0);
16589 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16590 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16591 DAG.getConstant(0, DL, MVT::i64));
16592}
16593
16594static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16595 SDLoc DL, SelectionDAG &DAG) {
16596 unsigned ScalarOpcode;
16597 switch (Opcode) {
16598 case ISD::VECREDUCE_AND:
16599 ScalarOpcode = ISD::AND;
16600 break;
16601 case ISD::VECREDUCE_OR:
16602 ScalarOpcode = ISD::OR;
16603 break;
16604 case ISD::VECREDUCE_XOR:
16605 ScalarOpcode = ISD::XOR;
16606 break;
16607 default:
16608 llvm_unreachable("Expected bitwise vector reduction");
16609 return SDValue();
16610 }
16611
16612 EVT VecVT = Vec.getValueType();
16613 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16614 "Expected power-of-2 length vector");
16615
16616 EVT ElemVT = VecVT.getVectorElementType();
16617
16618 SDValue Result;
16619 unsigned NumElems = VecVT.getVectorNumElements();
16620
16621 // Special case for boolean reductions
16622 if (ElemVT == MVT::i1) {
16623 // Split large vectors into smaller ones
16624 if (NumElems > 16) {
16625 SDValue Lo, Hi;
16626 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16627 EVT HalfVT = Lo.getValueType();
16628 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16629 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16630 }
16631
16632 // Results of setcc operations get widened to 128 bits if their input
16633 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16634 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16635 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16636 // size leads to the best codegen, since e.g. setcc results might need to be
16637 // truncated otherwise.
16638 unsigned ExtendedWidth = 64;
16639 if (Vec.getOpcode() == ISD::SETCC &&
16640 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16641 ExtendedWidth = 128;
16642 }
16643 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16644
16645 // any_ext doesn't work with umin/umax, so only use it for uadd.
16646 unsigned ExtendOp =
16647 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16648 SDValue Extended = DAG.getNode(
16649 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16650 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16651 // in that case we bitcast the sign extended values from v2i64 to v4i32
16652 // before reduction for optimal code generation.
16653 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16654 NumElems == 2 && ExtendedWidth == 128) {
16655 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16656 ExtendedVT = MVT::i32;
16657 }
16658 switch (ScalarOpcode) {
16659 case ISD::AND:
16660 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16661 break;
16662 case ISD::OR:
16663 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16664 break;
16665 case ISD::XOR:
16666 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16667 break;
16668 default:
16669 llvm_unreachable("Unexpected Opcode");
16670 }
16671
16672 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16673 } else {
16674 // Iteratively split the vector in half and combine using the bitwise
16675 // operation until it fits in a 64 bit register.
16676 while (VecVT.getSizeInBits() > 64) {
16677 SDValue Lo, Hi;
16678 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16679 VecVT = Lo.getValueType();
16680 NumElems = VecVT.getVectorNumElements();
16681 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16682 }
16683
16684 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16685
16686 // Do the remaining work on a scalar since it allows the code generator to
16687 // combine the shift and bitwise operation into one instruction and since
16688 // integer instructions can have higher throughput than vector instructions.
16689 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16690
16691 // Iteratively combine the lower and upper halves of the scalar using the
16692 // bitwise operation, halving the relevant region of the scalar in each
16693 // iteration, until the relevant region is just one element of the original
16694 // vector.
16695 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16696 SDValue ShiftAmount =
16697 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16698 SDValue Shifted =
16699 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16700 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16701 }
16702
16703 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16704 }
16705
16706 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16707}
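// Illustrative sketch (added commentary, not part of the upstream source):
// vecreduce_and of a v8i1 setcc result is sign-extended to v8i8 (all-ones or
// all-zero lanes) and reduced with a UMIN reduction; any zero lane forces the
// minimum, and hence the AND, to zero. For non-i1 elements, e.g. vecreduce_xor
// of v2i32, the vector is bitcast to i64 and folded as
//   t = xor i64 X, (srl X, 32)    ; low 32 bits now hold the reduction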
16708
16709SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16710 SelectionDAG &DAG) const {
16711 SDValue Src = Op.getOperand(0);
16712 EVT SrcVT = Src.getValueType();
16713
16714 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16715 // widening by inserting zeroes.
16716 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16717 SrcVT == MVT::v2f16) {
16718 SDLoc DL(Op);
16719 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16720 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16721 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16722 }
16723
16724 // Try to lower fixed length reductions to SVE.
16725 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16726 Op.getOpcode() == ISD::VECREDUCE_AND ||
16727 Op.getOpcode() == ISD::VECREDUCE_OR ||
16728 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16729 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16730 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16731 SrcVT.getVectorElementType() == MVT::i64);
16732 if (SrcVT.isScalableVector() ||
16733 useSVEForFixedLengthVectorVT(
16734 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16735
16736 if (SrcVT.getVectorElementType() == MVT::i1)
16737 return LowerPredReductionToSVE(Op, DAG);
16738
16739 switch (Op.getOpcode()) {
16740 case ISD::VECREDUCE_ADD:
16741 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16742 case ISD::VECREDUCE_AND:
16743 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16744 case ISD::VECREDUCE_OR:
16745 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16746 case ISD::VECREDUCE_SMAX:
16747 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16748 case ISD::VECREDUCE_SMIN:
16749 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16750 case ISD::VECREDUCE_UMAX:
16751 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16752 case ISD::VECREDUCE_UMIN:
16753 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16754 case ISD::VECREDUCE_XOR:
16755 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16756 case ISD::VECREDUCE_FADD:
16757 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16758 case ISD::VECREDUCE_FMAX:
16759 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16760 case ISD::VECREDUCE_FMIN:
16761 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16762 case ISD::VECREDUCE_FMAXIMUM:
16763 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16764 case ISD::VECREDUCE_FMINIMUM:
16765 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16766 default:
16767 llvm_unreachable("Unhandled fixed length reduction");
16768 }
16769 }
16770
16771 // Lower NEON reductions.
16772 SDLoc DL(Op);
16773 switch (Op.getOpcode()) {
16774 case ISD::VECREDUCE_AND:
16775 case ISD::VECREDUCE_OR:
16776 case ISD::VECREDUCE_XOR:
16777 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16778 Op.getValueType(), DL, DAG);
16779 case ISD::VECREDUCE_ADD:
16780 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16781 case ISD::VECREDUCE_SMAX:
16782 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16783 case ISD::VECREDUCE_SMIN:
16784 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16785 case ISD::VECREDUCE_UMAX:
16786 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16787 case ISD::VECREDUCE_UMIN:
16788 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16789 default:
16790 llvm_unreachable("Unhandled reduction");
16791 }
16792}
16793
16794SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16795 SelectionDAG &DAG) const {
16796 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16797 // No point replacing if we don't have the relevant instruction/libcall anyway
16798 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16799 return SDValue();
16800
16801 // LSE has an atomic load-clear instruction, but not a load-and.
16802 SDLoc DL(Op);
16803 MVT VT = Op.getSimpleValueType();
16804 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16805 SDValue RHS = Op.getOperand(2);
16806 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16807 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16808 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16809 Op.getOperand(0), Op.getOperand(1), RHS,
16810 AN->getMemOperand());
16811}
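// Illustrative sketch (added commentary, not part of the upstream source;
// register names are placeholders):
//   %old = atomicrmw and ptr %p, i32 %mask
// becomes an atomic load-clear of the complemented mask, e.g. with LSE
//   mvn    w8, w_mask             ; invert the AND mask
//   ldclr  w8, w_old, [x_p]       ; old & ~(~mask) == old & mask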
16812
16813SDValue
16814AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16815 SelectionDAG &DAG) const {
16816
16817 SDLoc DL(Op);
16818 // Get the inputs.
16819 SDNode *Node = Op.getNode();
16820 SDValue Chain = Op.getOperand(0);
16821 SDValue Size = Op.getOperand(1);
16822 MaybeAlign Align =
16823 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16824 EVT VT = Node->getValueType(0);
16825
16827 "no-stack-arg-probe")) {
16828 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16829 Chain = SP.getValue(1);
16830 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16831 if (Align)
16832 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16833 DAG.getSignedConstant(-Align->value(), DL, VT));
16834 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16835 SDValue Ops[2] = {SP, Chain};
16836 return DAG.getMergeValues(Ops, DL);
16837 }
16838
16839 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16840
16841 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16842 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16843 PtrVT, 0);
16844
16845 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16846 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16847 if (Subtarget->hasCustomCallingConv())
16848 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16849
16850 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16851 DAG.getConstant(4, DL, MVT::i64));
16852 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16853 Chain =
16854 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16855 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16856 DAG.getRegisterMask(Mask), Chain.getValue(1));
16857 // To match the actual intent better, we should read the output from X15 here
16858 // again (instead of potentially spilling it to the stack), but rereading Size
16859 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16860 // here.
16861
16862 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16863 DAG.getConstant(4, DL, MVT::i64));
16864
16865 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16866 Chain = SP.getValue(1);
16867 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16868 if (Align)
16869 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16870 DAG.getSignedConstant(-Align->value(), DL, VT));
16871 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16872
16873 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16874
16875 SDValue Ops[2] = {SP, Chain};
16876 return DAG.getMergeValues(Ops, DL);
16877}
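// Illustrative sketch (added commentary, not part of the upstream source) of
// the Windows probing sequence generated above, roughly:
//   x15 = AllocSize >> 4          ; the probe helper takes the size in 16-byte units
//   bl   __chkstk                 ; probes each page below the new SP
//   sub  sp, sp, x15, lsl #4      ; then SP is dropped (and realigned if needed)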
16878
16879SDValue
16880AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16881 SelectionDAG &DAG) const {
16882 // Get the inputs.
16883 SDNode *Node = Op.getNode();
16884 SDValue Chain = Op.getOperand(0);
16885 SDValue Size = Op.getOperand(1);
16886
16887 MaybeAlign Align =
16888 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16889 SDLoc DL(Op);
16890 EVT VT = Node->getValueType(0);
16891
16892 // Construct the new SP value in a GPR.
16893 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16894 Chain = SP.getValue(1);
16895 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16896 if (Align)
16897 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16898 DAG.getSignedConstant(-Align->value(), DL, VT));
16899
16900 // Set the real SP to the new value with a probing loop.
16901 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16902 SDValue Ops[2] = {SP, Chain};
16903 return DAG.getMergeValues(Ops, DL);
16904}
16905
16906SDValue
16907AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16908 SelectionDAG &DAG) const {
16909 MachineFunction &MF = DAG.getMachineFunction();
16910
16911 if (Subtarget->isTargetWindows())
16912 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16913 else if (hasInlineStackProbe(MF))
16914 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16915 else
16916 return SDValue();
16917}
16918
16919SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16920 unsigned NewOp) const {
16921 if (Subtarget->hasSVE2())
16922 return LowerToPredicatedOp(Op, DAG, NewOp);
16923
16924 // Default to expand.
16925 return SDValue();
16926}
16927
16928SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16929 SelectionDAG &DAG) const {
16930 EVT VT = Op.getValueType();
16931 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16932
16933 SDLoc DL(Op);
16934 APInt MulImm = Op.getConstantOperandAPInt(0);
16935 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16936 VT);
16937}
16938
16939/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16940template <unsigned NumVecs>
16941 static bool
16942 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16943 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16944 Info.opc = ISD::INTRINSIC_VOID;
16945 // Retrieve EC from first vector argument.
16946 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16947 ElementCount EC = VT.getVectorElementCount();
16948#ifndef NDEBUG
16949 // Check the assumption that all input vectors are the same type.
16950 for (unsigned I = 0; I < NumVecs; ++I)
16951 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16952 "Invalid type.");
16953#endif
16954 // memVT is `NumVecs * VT`.
16955 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16956 EC * NumVecs);
16957 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16958 Info.offset = 0;
16959 Info.align.reset();
16960 Info.flags = MachineMemOperand::MOStore;
16961 return true;
16962}
16963
16964/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16965/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16966/// specified in the intrinsic calls.
16967 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16968 const CallInst &I,
16969 MachineFunction &MF,
16970 unsigned Intrinsic) const {
16971 auto &DL = I.getDataLayout();
16972 switch (Intrinsic) {
16973 case Intrinsic::aarch64_sve_st2:
16974 return setInfoSVEStN<2>(*this, DL, Info, I);
16975 case Intrinsic::aarch64_sve_st3:
16976 return setInfoSVEStN<3>(*this, DL, Info, I);
16977 case Intrinsic::aarch64_sve_st4:
16978 return setInfoSVEStN<4>(*this, DL, Info, I);
16979 case Intrinsic::aarch64_neon_ld2:
16980 case Intrinsic::aarch64_neon_ld3:
16981 case Intrinsic::aarch64_neon_ld4:
16982 case Intrinsic::aarch64_neon_ld1x2:
16983 case Intrinsic::aarch64_neon_ld1x3:
16984 case Intrinsic::aarch64_neon_ld1x4: {
16985 Info.opc = ISD::INTRINSIC_W_CHAIN;
16986 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16987 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16988 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16989 Info.offset = 0;
16990 Info.align.reset();
16991 // volatile loads with NEON intrinsics not supported
16992 Info.flags = MachineMemOperand::MOLoad;
16993 return true;
16994 }
16995 case Intrinsic::aarch64_neon_ld2lane:
16996 case Intrinsic::aarch64_neon_ld3lane:
16997 case Intrinsic::aarch64_neon_ld4lane:
16998 case Intrinsic::aarch64_neon_ld2r:
16999 case Intrinsic::aarch64_neon_ld3r:
17000 case Intrinsic::aarch64_neon_ld4r: {
17001 Info.opc = ISD::INTRINSIC_W_CHAIN;
17002 // These ldN intrinsics return a struct whose members all have the same vector type.
17003 Type *RetTy = I.getType();
17004 auto *StructTy = cast<StructType>(RetTy);
17005 unsigned NumElts = StructTy->getNumElements();
17006 Type *VecTy = StructTy->getElementType(0);
17007 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17008 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17009 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17010 Info.offset = 0;
17011 Info.align.reset();
17012 // volatile loads with NEON intrinsics not supported
17013 Info.flags = MachineMemOperand::MOLoad;
17014 return true;
17015 }
17016 case Intrinsic::aarch64_neon_st2:
17017 case Intrinsic::aarch64_neon_st3:
17018 case Intrinsic::aarch64_neon_st4:
17019 case Intrinsic::aarch64_neon_st1x2:
17020 case Intrinsic::aarch64_neon_st1x3:
17021 case Intrinsic::aarch64_neon_st1x4: {
17022 Info.opc = ISD::INTRINSIC_VOID;
17023 unsigned NumElts = 0;
17024 for (const Value *Arg : I.args()) {
17025 Type *ArgTy = Arg->getType();
17026 if (!ArgTy->isVectorTy())
17027 break;
17028 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17029 }
17030 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17031 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17032 Info.offset = 0;
17033 Info.align.reset();
17034 // volatile stores with NEON intrinsics not supported
17035 Info.flags = MachineMemOperand::MOStore;
17036 return true;
17037 }
17038 case Intrinsic::aarch64_neon_st2lane:
17039 case Intrinsic::aarch64_neon_st3lane:
17040 case Intrinsic::aarch64_neon_st4lane: {
17041 Info.opc = ISD::INTRINSIC_VOID;
17042 unsigned NumElts = 0;
17043 // All the vector arguments have the same type.
17044 Type *VecTy = I.getArgOperand(0)->getType();
17045 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17046
17047 for (const Value *Arg : I.args()) {
17048 Type *ArgTy = Arg->getType();
17049 if (!ArgTy->isVectorTy())
17050 break;
17051 NumElts += 1;
17052 }
17053
17054 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17055 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17056 Info.offset = 0;
17057 Info.align.reset();
17058 // volatile stores with NEON intrinsics not supported
17059 Info.flags = MachineMemOperand::MOStore;
17060 return true;
17061 }
17062 case Intrinsic::aarch64_ldaxr:
17063 case Intrinsic::aarch64_ldxr: {
17064 Type *ValTy = I.getParamElementType(0);
17065 Info.opc = ISD::INTRINSIC_W_CHAIN;
17066 Info.memVT = MVT::getVT(ValTy);
17067 Info.ptrVal = I.getArgOperand(0);
17068 Info.offset = 0;
17069 Info.align = DL.getABITypeAlign(ValTy);
17070 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17071 return true;
17072 }
17073 case Intrinsic::aarch64_stlxr:
17074 case Intrinsic::aarch64_stxr: {
17075 Type *ValTy = I.getParamElementType(1);
17076 Info.opc = ISD::INTRINSIC_W_CHAIN;
17077 Info.memVT = MVT::getVT(ValTy);
17078 Info.ptrVal = I.getArgOperand(1);
17079 Info.offset = 0;
17080 Info.align = DL.getABITypeAlign(ValTy);
17081 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17082 return true;
17083 }
17084 case Intrinsic::aarch64_ldaxp:
17085 case Intrinsic::aarch64_ldxp:
17086 Info.opc = ISD::INTRINSIC_W_CHAIN;
17087 Info.memVT = MVT::i128;
17088 Info.ptrVal = I.getArgOperand(0);
17089 Info.offset = 0;
17090 Info.align = Align(16);
17091 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17092 return true;
17093 case Intrinsic::aarch64_stlxp:
17094 case Intrinsic::aarch64_stxp:
17095 Info.opc = ISD::INTRINSIC_W_CHAIN;
17096 Info.memVT = MVT::i128;
17097 Info.ptrVal = I.getArgOperand(2);
17098 Info.offset = 0;
17099 Info.align = Align(16);
17100 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17101 return true;
17102 case Intrinsic::aarch64_sve_ldnt1: {
17103 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17104 Info.opc = ISD::INTRINSIC_W_CHAIN;
17105 Info.memVT = MVT::getVT(I.getType());
17106 Info.ptrVal = I.getArgOperand(1);
17107 Info.offset = 0;
17108 Info.align = DL.getABITypeAlign(ElTy);
17109 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
17110 return true;
17111 }
17112 case Intrinsic::aarch64_sve_stnt1: {
17113 Type *ElTy =
17114 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17115 Info.opc = ISD::INTRINSIC_W_CHAIN;
17116 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17117 Info.ptrVal = I.getArgOperand(2);
17118 Info.offset = 0;
17119 Info.align = DL.getABITypeAlign(ElTy);
17120 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
17121 return true;
17122 }
17123 case Intrinsic::aarch64_mops_memset_tag: {
17124 Value *Dst = I.getArgOperand(0);
17125 Value *Val = I.getArgOperand(1);
17126 Info.opc = ISD::INTRINSIC_W_CHAIN;
17127 Info.memVT = MVT::getVT(Val->getType());
17128 Info.ptrVal = Dst;
17129 Info.offset = 0;
17130 Info.align = I.getParamAlign(0).valueOrOne();
17131 Info.flags = MachineMemOperand::MOStore;
17132 // The size of the memory being operated on is unknown at this point
17133 Info.size = MemoryLocation::UnknownSize;
17134 return true;
17135 }
17136 default:
17137 break;
17138 }
17139
17140 return false;
17141}
17142
17143 bool AArch64TargetLowering::shouldReduceLoadWidth(
17144 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17145 std::optional<unsigned> ByteOffset) const {
17146 // TODO: This may be worth removing. Check regression tests for diffs.
17147 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17148 ByteOffset))
17149 return false;
17150
17151 // If we're reducing the load width in order to avoid having to use an extra
17152 // instruction to do extension then it's probably a good idea.
17153 if (ExtTy != ISD::NON_EXTLOAD)
17154 return true;
17155 // Don't reduce load width if it would prevent us from combining a shift into
17156 // the offset.
17157 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17158 assert(Mem);
17159 const SDValue &Base = Mem->getBasePtr();
17160 if (Base.getOpcode() == ISD::ADD &&
17161 Base.getOperand(1).getOpcode() == ISD::SHL &&
17162 Base.getOperand(1).hasOneUse() &&
17163 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17164 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17165 if (Mem->getMemoryVT().isScalableVector())
17166 return false;
17167 // The shift can be combined if it matches the size of the value being
17168 // loaded (and so reducing the width would make it not match).
17169 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17170 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17171 if (ShiftAmount == Log2_32(LoadBytes))
17172 return false;
17173 }
17174 // We have no reason to disallow reducing the load width, so allow it.
17175 return true;
17176}
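// Illustrative example for the hook above (assumed DAG shape, not from a
// test): for an i32 load whose address is (add x, (shl y, 2)), the shift
// matches log2(4 load bytes) and folds into the scaled addressing mode,
// e.g. "ldr w0, [x1, x2, lsl #2]". Narrowing the load to i16 would make the
// shift amount stop matching, so the width reduction is refused in that case.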
17177
17178// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17179bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
17180 EVT VT = Extend.getValueType();
17181 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17182 SDValue Extract = Extend.getOperand(0);
17183 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17184 Extract = Extract.getOperand(0);
17185 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17186 EVT VecVT = Extract.getOperand(0).getValueType();
17187 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17188 return false;
17189 }
17190 }
17191 return true;
17192}
17193
17194// Truncations from 64-bit GPR to 32-bit GPR are free.
17195bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
17196 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17197 return false;
17198 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17199 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17200 return NumBits1 > NumBits2;
17201}
17202bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
17203 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17204 return false;
17205 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17206 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17207 return NumBits1 > NumBits2;
17208}
17209
17210/// Check if it is profitable to hoist instruction in then/else to if.
17211/// Not profitable if I and its user can form an FMA instruction
17212/// because we prefer FMSUB/FMADD.
17213bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
17214 if (I->getOpcode() != Instruction::FMul)
17215 return true;
17216
17217 if (!I->hasOneUse())
17218 return true;
17219
17220 Instruction *User = I->user_back();
17221
17222 if (!(User->getOpcode() == Instruction::FSub ||
17223 User->getOpcode() == Instruction::FAdd))
17224 return true;
17225
17227 const Function *F = I->getFunction();
17228 const DataLayout &DL = F->getDataLayout();
17229 Type *Ty = User->getOperand(0)->getType();
17230
17231 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17233 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17234 I->getFastMathFlags().allowContract()));
17235}
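// Illustrative example for isProfitableToHoist above (assumed IR): with
// contraction allowed,
//   %m = fmul double %a, %b
//   %s = fsub double %c, %m
// selects to a single FMSUB. Hoisting the fmul out of a then/else block
// would separate it from its only fadd/fsub user and lose the fusion, so the
// hook reports hoisting as not profitable for that shape.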
17236
17237// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17238// 64-bit GPR.
17239bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17240 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17241 return false;
17242 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17243 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17244 return NumBits1 == 32 && NumBits2 == 64;
17245}
17246bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17247 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17248 return false;
17249 unsigned NumBits1 = VT1.getSizeInBits();
17250 unsigned NumBits2 = VT2.getSizeInBits();
17251 return NumBits1 == 32 && NumBits2 == 64;
17252}
17253
17254bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17255 EVT VT1 = Val.getValueType();
17256 if (isZExtFree(VT1, VT2)) {
17257 return true;
17258 }
17259
17260 if (Val.getOpcode() != ISD::LOAD)
17261 return false;
17262
17263 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17264 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17265 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17266 VT1.getSizeInBits() <= 32);
17267}
17268
17269bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17270 if (isa<FPExtInst>(Ext))
17271 return false;
17272
17273 // Vector types are not free.
17274 if (Ext->getType()->isVectorTy())
17275 return false;
17276
17277 for (const Use &U : Ext->uses()) {
17278 // The extension is free if we can fold it with a left shift in an
17279 // addressing mode or an arithmetic operation: add, sub, and cmp.
17280
17281 // Is there a shift?
17282 const Instruction *Instr = cast<Instruction>(U.getUser());
17283
17284 // Is this a constant shift?
17285 switch (Instr->getOpcode()) {
17286 case Instruction::Shl:
17287 if (!isa<ConstantInt>(Instr->getOperand(1)))
17288 return false;
17289 break;
17290 case Instruction::GetElementPtr: {
17291 gep_type_iterator GTI = gep_type_begin(Instr);
17292 auto &DL = Ext->getDataLayout();
17293 std::advance(GTI, U.getOperandNo()-1);
17294 Type *IdxTy = GTI.getIndexedType();
17295 // This extension will end up with a shift because of the scaling factor.
17296 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17297 // Get the shift amount based on the scaling factor:
17298 // log2(sizeof(IdxTy)) - log2(8).
17299 if (IdxTy->isScalableTy())
17300 return false;
17301 uint64_t ShiftAmt =
17302 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17303 3;
17304 // Is the constant foldable in the shift of the addressing mode?
17305 // I.e., shift amount is between 1 and 4 inclusive.
17306 if (ShiftAmt == 0 || ShiftAmt > 4)
17307 return false;
17308 break;
17309 }
17310 case Instruction::Trunc:
17311 // Check if this is a noop.
17312 // trunc(sext ty1 to ty2) to ty1.
17313 if (Instr->getType() == Ext->getOperand(0)->getType())
17314 continue;
17315 [[fallthrough]];
17316 default:
17317 return false;
17318 }
17319
17320 // At this point we can use the bfm family, so this extension is free
17321 // for that use.
17322 }
17323 return true;
17324}
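// Illustrative example for isExtFreeImpl above (assumed IR): a "sext i32 %i
// to i64" that only feeds a GEP with a 4-byte element type folds into the
// extended-register addressing mode, e.g. "ldr w0, [x1, w2, sxtw #2]", so
// the extension is considered free for that use.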
17325
17326static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17327 unsigned NumElts, bool IsLittleEndian,
17328 SmallVectorImpl<int> &Mask) {
17329 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17330 return false;
17331
17332 assert(DstWidth % SrcWidth == 0 &&
17333 "TBL lowering is not supported for a conversion instruction with this "
17334 "source and destination element type.");
17335
17336 unsigned Factor = DstWidth / SrcWidth;
17337 unsigned MaskLen = NumElts * Factor;
17338
17339 Mask.clear();
17340 Mask.resize(MaskLen, NumElts);
17341
17342 unsigned SrcIndex = 0;
17343 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17344 Mask[I] = SrcIndex++;
17345
17346 return true;
17347}
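// Worked example for createTblShuffleMask above (illustrative values):
// SrcWidth = 8, DstWidth = 32, NumElts = 4, little-endian gives Factor = 4,
// MaskLen = 16, and
//   Mask = { 0, 4, 4, 4,  1, 4, 4, 4,  2, 4, 4, 4,  3, 4, 4, 4 }
// where the fill value NumElts (= 4) selects the zero element that the
// callers below insert as the first lane of the second shuffle operand.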
17348
17349static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
17350 FixedVectorType *ZExtTy,
17351 FixedVectorType *DstTy,
17352 bool IsLittleEndian) {
17353 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17354 unsigned NumElts = SrcTy->getNumElements();
17355 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17356 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17357
17358 SmallVector<int> Mask;
17359 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17360 return nullptr;
17361
17362 auto *FirstEltZero = Builder.CreateInsertElement(
17363 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17364 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17365 Result = Builder.CreateBitCast(Result, DstTy);
17366 if (DstTy != ZExtTy)
17367 Result = Builder.CreateZExt(Result, ZExtTy);
17368 return Result;
17369}
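// Illustrative IR sketch for createTblShuffleForZExt above (assumed, not
// taken from a test): for "zext <4 x i8> %v to <4 x i32>" it builds roughly
//   %zero = insertelement <4 x i8> poison, i8 0, i64 0
//   %shuf = shufflevector <4 x i8> %v, <4 x i8> %zero,
//           <16 x i32> <i32 0, i32 4, i32 4, i32 4, i32 1, i32 4, i32 4, i32 4,
//                       i32 2, i32 4, i32 4, i32 4, i32 3, i32 4, i32 4, i32 4>
//   %res  = bitcast <16 x i8> %shuf to <4 x i32>
// which the backend can select as a single TBL with a constant table.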
17370
17371static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
17372 FixedVectorType *DstTy,
17373 bool IsLittleEndian) {
17374 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17375 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17376 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17377
17378 SmallVector<int> Mask;
17379 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17380 !IsLittleEndian, Mask))
17381 return nullptr;
17382
17383 auto *FirstEltZero = Builder.CreateInsertElement(
17384 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17385
17386 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17387}
17388
17389static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17390 IRBuilder<> Builder(TI);
17392 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17393 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17394 auto *DstTy = cast<FixedVectorType>(TI->getType());
17395 assert(SrcTy->getElementType()->isIntegerTy() &&
17396 "Non-integer type source vector element is not supported");
17397 assert(DstTy->getElementType()->isIntegerTy(8) &&
17398 "Unsupported destination vector element type");
17399 unsigned SrcElemTySz =
17400 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17401 unsigned DstElemTySz =
17402 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17403 assert((SrcElemTySz % DstElemTySz == 0) &&
17404 "Cannot lower truncate to tbl instructions for a source element size "
17405 "that is not divisible by the destination element size");
17406 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17407 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17408 "Unsupported source vector element type size");
17409 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17410
17411 // Create a mask to choose every nth byte from the source vector table of
17412 // bytes to create the truncated destination vector, where 'n' is the truncate
17413 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
17414 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
17415 SmallVector<Constant *, 16> MaskConst;
17416 for (int Itr = 0; Itr < 16; Itr++) {
17417 if (Itr < NumElements)
17418 MaskConst.push_back(Builder.getInt8(
17419 IsLittleEndian ? Itr * TruncFactor
17420 : Itr * TruncFactor + (TruncFactor - 1)));
17421 else
17422 MaskConst.push_back(Builder.getInt8(255));
17423 }
17424
17425 int MaxTblSz = 128 * 4;
17426 int MaxSrcSz = SrcElemTySz * NumElements;
17427 int ElemsPerTbl =
17428 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17429 assert(ElemsPerTbl <= 16 &&
17430 "Maximum elements selected using TBL instruction cannot exceed 16!");
17431
17432 int ShuffleCount = 128 / SrcElemTySz;
17433 SmallVector<int> ShuffleLanes;
17434 for (int i = 0; i < ShuffleCount; ++i)
17435 ShuffleLanes.push_back(i);
17436
17437 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17438 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17439 // call TBL & save the result in a vector of TBL results for combining later.
17441 while (ShuffleLanes.back() < NumElements) {
17442 Parts.push_back(Builder.CreateBitCast(
17443 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17444
17445 if (Parts.size() == 4) {
17446 Parts.push_back(ConstantVector::get(MaskConst));
17447 Results.push_back(
17448 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17449 Parts.clear();
17450 }
17451
17452 for (int i = 0; i < ShuffleCount; ++i)
17453 ShuffleLanes[i] += ShuffleCount;
17454 }
17455
17456 assert((Parts.empty() || Results.empty()) &&
17457 "Lowering trunc for vectors requiring different TBL instructions is "
17458 "not supported!");
17459 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17460 // registers
17461 if (!Parts.empty()) {
17462 Intrinsic::ID TblID;
17463 switch (Parts.size()) {
17464 case 1:
17465 TblID = Intrinsic::aarch64_neon_tbl1;
17466 break;
17467 case 2:
17468 TblID = Intrinsic::aarch64_neon_tbl2;
17469 break;
17470 case 3:
17471 TblID = Intrinsic::aarch64_neon_tbl3;
17472 break;
17473 }
17474
17475 Parts.push_back(ConstantVector::get(MaskConst));
17476 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17477 }
17478
17479 // Extract the destination vector from TBL result(s) after combining them
17480 // where applicable. Currently, at most two TBLs are supported.
17481 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17482 "more than 2 tbl instructions!");
17483 Value *FinalResult = Results[0];
17484 if (Results.size() == 1) {
17485 if (ElemsPerTbl < 16) {
17486 SmallVector<int> FinalMask(ElemsPerTbl);
17487 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17488 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17489 }
17490 } else {
17491 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17492 if (ElemsPerTbl < 16) {
17493 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17494 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17495 } else {
17496 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17497 }
17498 FinalResult =
17499 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17500 }
17501
17502 TI->replaceAllUsesWith(FinalResult);
17503 TI->eraseFromParent();
17504}
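// Illustrative example for createTblForTrunc above (assumed): for
// "trunc <16 x i32> %x to <16 x i8>", TruncFactor = 4 and the source spans
// four 128-bit registers, so the input is split into four <4 x i32>
// shuffles, each bitcast to <16 x i8>, and fed to one tbl4 whose constant
// mask selects bytes 0, 4, 8, ..., 60 (little-endian) to form the result.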
17505
17506bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17507 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17508 // shuffle_vector instructions are serialized when targeting SVE,
17509 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17510 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17511 return false;
17512
17513 // Try to optimize conversions using tbl. This requires materializing constant
17514 // index vectors, which can increase code size and add loads. Skip the
17515 // transform unless the conversion is in a loop block guaranteed to execute
17516 // and we are not optimizing for size.
17517 Function *F = I->getParent()->getParent();
17518 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17519 return false;
17520
17521 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17522 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17523 if (!SrcTy || !DstTy)
17524 return false;
17525
17526 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17527 // lowered to tbl instructions to insert the original i8 elements
17528 // into i8x lanes. This is enabled for cases where it is beneficial.
17529 auto *ZExt = dyn_cast<ZExtInst>(I);
17530 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17531 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17532 if (DstWidth % 8 != 0)
17533 return false;
17534
17535 auto *TruncDstType =
17537 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17538 // the remaining ZExt folded into the user, don't use tbl lowering.
17539 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17540 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17543 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17544 return false;
17545
17546 DstTy = TruncDstType;
17547 }
17548
17549 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17550 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17551 // most one extra extend step is needed and using tbl is not profitable.
17552 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17553 // udot instruction.
17554 if (SrcWidth * 4 <= DstWidth) {
17555 if (all_of(I->users(), [&](auto *U) {
17556 auto *SingleUser = cast<Instruction>(&*U);
17557 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17558 return true;
17559 if (match(SingleUser,
17560 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17561 m_Value(), m_Specific(I))))
17562 return true;
17563 return false;
17564 }))
17565 return false;
17566 }
17567
17568 if (DstTy->getScalarSizeInBits() >= 64)
17569 return false;
17570
17571 IRBuilder<> Builder(ZExt);
17572 Value *Result = createTblShuffleForZExt(
17573 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17574 DstTy, Subtarget->isLittleEndian());
17575 if (!Result)
17576 return false;
17577 ZExt->replaceAllUsesWith(Result);
17578 ZExt->eraseFromParent();
17579 return true;
17580 }
17581
17582 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17583 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17584 DstTy->getElementType()->isFloatTy()) ||
17585 (SrcTy->getElementType()->isIntegerTy(16) &&
17586 DstTy->getElementType()->isDoubleTy()))) {
17587 IRBuilder<> Builder(I);
17589 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17590 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17591 assert(ZExt && "Cannot fail for the i8 to float conversion");
17592 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17593 I->replaceAllUsesWith(UI);
17594 I->eraseFromParent();
17595 return true;
17596 }
17597
17598 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17599 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17600 DstTy->getElementType()->isFloatTy()) {
17601 IRBuilder<> Builder(I);
17602 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17604 Subtarget->isLittleEndian());
17605 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17606 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17607 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17608 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17609 I->replaceAllUsesWith(SI);
17610 I->eraseFromParent();
17611 return true;
17612 }
17613
17614 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17615 // followed by a truncate lowered to using tbl.4.
17616 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17617 if (FPToUI &&
17618 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17619 SrcTy->getElementType()->isFloatTy() &&
17620 DstTy->getElementType()->isIntegerTy(8)) {
17621 IRBuilder<> Builder(I);
17622 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17623 VectorType::getInteger(SrcTy));
17624 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17625 I->replaceAllUsesWith(TruncI);
17626 I->eraseFromParent();
17627 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17628 return true;
17629 }
17630
17631 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17632 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17633 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17634 // registers
17635 auto *TI = dyn_cast<TruncInst>(I);
17636 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17637 ((SrcTy->getElementType()->isIntegerTy(32) ||
17638 SrcTy->getElementType()->isIntegerTy(64)) &&
17639 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17640 createTblForTrunc(TI, Subtarget->isLittleEndian());
17641 return true;
17642 }
17643
17644 return false;
17645}
17646
17647bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17648 Align &RequiredAlignment) const {
17649 if (!LoadedType.isSimple() ||
17650 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17651 return false;
17652 // Cyclone supports unaligned accesses.
17653 RequiredAlignment = Align(1);
17654 unsigned NumBits = LoadedType.getSizeInBits();
17655 return NumBits == 32 || NumBits == 64;
17656}
17657
17658/// A helper function for determining the number of interleaved accesses we
17659/// will generate when lowering accesses of the given type.
17660unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17661 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17662 unsigned VecSize = 128;
17663 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17664 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17665 if (UseScalable && isa<FixedVectorType>(VecTy))
17666 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17667 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17668}
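// Worked example for getNumInterleavedAccesses above (illustrative): a fixed
// <16 x i32> vector is 512 bits, so with 128-bit NEON vectors this returns
// (16 * 32 + 127) / 128 = 4 accesses; with fixed-length SVE and a 256-bit
// minimum vector size it would return 2.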
17669
17670MachineMemOperand::Flags
17671AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17672 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17673 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17674 return MOStridedAccess;
17675 return MachineMemOperand::MONone;
17676}
17677
17678bool AArch64TargetLowering::isLegalInterleavedAccessType(
17679 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17680 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17681 auto EC = VecTy->getElementCount();
17682 unsigned MinElts = EC.getKnownMinValue();
17683
17684 UseScalable = false;
17685
17686 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17687 (!Subtarget->useSVEForFixedLengthVectors() ||
17689 return false;
17690
17691 if (isa<ScalableVectorType>(VecTy) &&
17692 !Subtarget->isSVEorStreamingSVEAvailable())
17693 return false;
17694
17695 // Ensure the number of vector elements is greater than 1.
17696 if (MinElts < 2)
17697 return false;
17698
17699 // Ensure the element type is legal.
17700 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17701 return false;
17702
17703 if (EC.isScalable()) {
17704 UseScalable = true;
17705 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17706 }
17707
17708 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17709 if (Subtarget->useSVEForFixedLengthVectors()) {
17710 unsigned MinSVEVectorSize =
17711 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17712 if (VecSize % MinSVEVectorSize == 0 ||
17713 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17714 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17715 UseScalable = true;
17716 return true;
17717 }
17718 }
17719
17720 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17721 // 128 will be split into multiple interleaved accesses.
17722 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17723}
17724
17725static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17726 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17727 return ScalableVectorType::get(VTy->getElementType(), 2);
17728
17729 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17730 return ScalableVectorType::get(VTy->getElementType(), 4);
17731
17732 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17733 return ScalableVectorType::get(VTy->getElementType(), 8);
17734
17735 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17736 return ScalableVectorType::get(VTy->getElementType(), 8);
17737
17738 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17739 return ScalableVectorType::get(VTy->getElementType(), 2);
17740
17741 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17742 return ScalableVectorType::get(VTy->getElementType(), 4);
17743
17744 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17745 return ScalableVectorType::get(VTy->getElementType(), 8);
17746
17747 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17748 return ScalableVectorType::get(VTy->getElementType(), 16);
17749
17750 llvm_unreachable("Cannot handle input vector type");
17751}
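// Illustrative mapping for getSVEContainerIRType above (assumed): a fixed
// <8 x i16> maps to the packed container <vscale x 8 x i16> and <4 x float>
// maps to <vscale x 4 x float>, i.e. the SVE type with the same element type
// and 128 bits per vscale granule.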
17752
17753static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17754 bool Scalable, Type *LDVTy,
17755 Type *PtrTy) {
17756 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17757 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17758 Intrinsic::aarch64_sve_ld3_sret,
17759 Intrinsic::aarch64_sve_ld4_sret};
17760 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17761 Intrinsic::aarch64_neon_ld3,
17762 Intrinsic::aarch64_neon_ld4};
17763 if (Scalable)
17764 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17765
17766 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17767 {LDVTy, PtrTy});
17768}
17769
17770static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17771 bool Scalable, Type *STVTy,
17772 Type *PtrTy) {
17773 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17774 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17775 Intrinsic::aarch64_sve_st3,
17776 Intrinsic::aarch64_sve_st4};
17777 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17778 Intrinsic::aarch64_neon_st3,
17779 Intrinsic::aarch64_neon_st4};
17780 if (Scalable)
17781 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17782
17783 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17784 {STVTy, PtrTy});
17785}
17786
17787/// Lower an interleaved load into a ldN intrinsic.
17788///
17789/// E.g. Lower an interleaved load (Factor = 2):
17790/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17791/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17792/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17793///
17794/// Into:
17795/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17796/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17797/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17798bool AArch64TargetLowering::lowerInterleavedLoad(
17799 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17800 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17801 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17802 "Invalid interleave factor");
17803 assert(!Shuffles.empty() && "Empty shufflevector input");
17804 assert(Shuffles.size() == Indices.size() &&
17805 "Unmatched number of shufflevectors and indices");
17806
17807 auto *LI = dyn_cast<LoadInst>(Load);
17808 if (!LI)
17809 return false;
17810 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17811
17812 const DataLayout &DL = LI->getDataLayout();
17813
17814 VectorType *VTy = Shuffles[0]->getType();
17815
17816 // Skip if we do not have NEON and skip illegal vector types. We can
17817 // "legalize" wide vector types into multiple interleaved accesses as long as
17818 // the vector types are divisible by 128.
17819 bool UseScalable;
17820 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17821 return false;
17822
17823 // Check if the interleave is a zext(shuffle), that can be better optimized
17824 // into shift / and masks. For the moment we do this just for uitofp (not
17825 // zext) to avoid issues with widening instructions.
17826 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17827 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17828 SI->getType()->getScalarSizeInBits() * 4 ==
17829 SI->user_back()->getType()->getScalarSizeInBits();
17830 }))
17831 return false;
17832
17833 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17834
17835 auto *FVTy = cast<FixedVectorType>(VTy);
17836
17837 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17838 // load integer vectors first and then convert to pointer vectors.
17839 Type *EltTy = FVTy->getElementType();
17840 if (EltTy->isPointerTy())
17841 FVTy =
17842 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17843
17844 // If we're going to generate more than one load, reset the sub-vector type
17845 // to something legal.
17846 FVTy = FixedVectorType::get(FVTy->getElementType(),
17847 FVTy->getNumElements() / NumLoads);
17848
17849 auto *LDVTy =
17850 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17851
17852 IRBuilder<> Builder(LI);
17853
17854 // The base address of the load.
17855 Value *BaseAddr = LI->getPointerOperand();
17856
17857 Type *PtrTy = LI->getPointerOperandType();
17858 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17859 LDVTy->getElementCount());
17860
17861 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17862 UseScalable, LDVTy, PtrTy);
17863
17864 // Holds sub-vectors extracted from the load intrinsic return values. The
17865 // sub-vectors are associated with the shufflevector instructions they will
17866 // replace.
17867 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17868
17869 Value *PTrue = nullptr;
17870 if (UseScalable) {
17871 std::optional<unsigned> PgPattern =
17872 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17873 if (Subtarget->getMinSVEVectorSizeInBits() ==
17874 Subtarget->getMaxSVEVectorSizeInBits() &&
17875 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17876 PgPattern = AArch64SVEPredPattern::all;
17877
17878 auto *PTruePat =
17879 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17880 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17881 {PTruePat});
17882 }
17883
17884 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17885
17886 // If we're generating more than one load, compute the base address of
17887 // subsequent loads as an offset from the previous.
17888 if (LoadCount > 0)
17889 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17890 FVTy->getNumElements() * Factor);
17891
17892 CallInst *LdN;
17893 if (UseScalable)
17894 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17895 else
17896 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17897
17898 // Extract and store the sub-vectors returned by the load intrinsic.
17899 for (unsigned i = 0; i < Shuffles.size(); i++) {
17900 ShuffleVectorInst *SVI = Shuffles[i];
17901 unsigned Index = Indices[i];
17902
17903 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17904
17905 if (UseScalable)
17906 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17907
17908 // Convert the integer vector to pointer vector if the element is pointer.
17909 if (EltTy->isPointerTy())
17910 SubVec = Builder.CreateIntToPtr(
17912 FVTy->getNumElements()));
17913
17914 SubVecs[SVI].push_back(SubVec);
17915 }
17916 }
17917
17918 // Replace uses of the shufflevector instructions with the sub-vectors
17919 // returned by the load intrinsic. If a shufflevector instruction is
17920 // associated with more than one sub-vector, those sub-vectors will be
17921 // concatenated into a single wide vector.
17922 for (ShuffleVectorInst *SVI : Shuffles) {
17923 auto &SubVec = SubVecs[SVI];
17924 auto *WideVec =
17925 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17926 SVI->replaceAllUsesWith(WideVec);
17927 }
17928
17929 return true;
17930}
17931
17932template <typename Iter>
17933bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17934 int MaxLookupDist = 20;
17935 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17936 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17937 const Value *PtrA1 =
17938 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17939
17940 while (++It != End) {
17941 if (It->isDebugOrPseudoInst())
17942 continue;
17943 if (MaxLookupDist-- == 0)
17944 break;
17945 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17946 const Value *PtrB1 =
17947 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17948 DL, OffsetB);
17949 if (PtrA1 == PtrB1 &&
17950 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17951 .abs() == 16)
17952 return true;
17953 }
17954 }
17955
17956 return false;
17957}
17958
17959/// Lower an interleaved store into a stN intrinsic.
17960///
17961/// E.g. Lower an interleaved store (Factor = 3):
17962/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17963/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17964/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17965///
17966/// Into:
17967/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17968/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17969/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17970/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17971///
17972/// Note that the new shufflevectors will be removed and we'll only generate one
17973/// st3 instruction in CodeGen.
17974///
17975/// Example for a more general valid mask (Factor 3). Lower:
17976/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17977/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17978/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17979///
17980/// Into:
17981/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17982/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17983/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17984/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17985bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
17986 Value *LaneMask,
17987 ShuffleVectorInst *SVI,
17988 unsigned Factor,
17989 const APInt &GapMask) const {
17990
17991 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17992 "Invalid interleave factor");
17993 auto *SI = dyn_cast<StoreInst>(Store);
17994 if (!SI)
17995 return false;
17996 assert(!LaneMask && GapMask.popcount() == Factor &&
17997 "Unexpected mask on store");
17998
17999 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18000 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18001
18002 unsigned LaneLen = VecTy->getNumElements() / Factor;
18003 Type *EltTy = VecTy->getElementType();
18004 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18005
18006 const DataLayout &DL = SI->getDataLayout();
18007 bool UseScalable;
18008
18009 // Skip if we do not have NEON and skip illegal vector types. We can
18010 // "legalize" wide vector types into multiple interleaved accesses as long as
18011 // the vector types are divisible by 128.
18012 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18013 return false;
18014
18015 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18016
18017 Value *Op0 = SVI->getOperand(0);
18018 Value *Op1 = SVI->getOperand(1);
18019 IRBuilder<> Builder(SI);
18020
18021 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18022 // vectors to integer vectors.
18023 if (EltTy->isPointerTy()) {
18024 Type *IntTy = DL.getIntPtrType(EltTy);
18025 unsigned NumOpElts =
18026 cast<FixedVectorType>(Op0->getType())->getNumElements();
18027
18028 // Convert to the corresponding integer vector.
18029 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18030 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18031 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18032
18033 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18034 }
18035
18036 // If we're going to generate more than one store, reset the lane length
18037 // and sub-vector type to something legal.
18038 LaneLen /= NumStores;
18039 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18040
18041 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18042 : SubVecTy;
18043
18044 // The base address of the store.
18045 Value *BaseAddr = SI->getPointerOperand();
18046
18047 auto Mask = SVI->getShuffleMask();
18048
18049 // Bail out if none of the indices is in range: if the whole shuffle mask
18050 // is `poison`, `Mask` may be a vector of -1s, and continuing would lead to
18051 // an out-of-bounds read later.
18052 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
18053 return false;
18054 }
18055 // A 64-bit st2 which does not start at element 0 will involve adding extra
18056 // ext elements making the st2 unprofitable, and if there is a nearby store
18057 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18058 // zip;ldp pair which has higher throughput.
18059 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18060 (Mask[0] != 0 ||
18061 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18062 DL) ||
18063 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18064 BaseAddr, DL)))
18065 return false;
18066
18067 Type *PtrTy = SI->getPointerOperandType();
18068 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18069 STVTy->getElementCount());
18070
18071 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18072 UseScalable, STVTy, PtrTy);
18073
18074 Value *PTrue = nullptr;
18075 if (UseScalable) {
18076 std::optional<unsigned> PgPattern =
18077 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18078 if (Subtarget->getMinSVEVectorSizeInBits() ==
18079 Subtarget->getMaxSVEVectorSizeInBits() &&
18080 Subtarget->getMinSVEVectorSizeInBits() ==
18081 DL.getTypeSizeInBits(SubVecTy))
18082 PgPattern = AArch64SVEPredPattern::all;
18083
18084 auto *PTruePat =
18085 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18086 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18087 {PTruePat});
18088 }
18089
18090 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18091
18093
18094 // Split the shufflevector operands into sub vectors for the new stN call.
18095 for (unsigned i = 0; i < Factor; i++) {
18096 Value *Shuffle;
18097 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18098 if (Mask[IdxI] >= 0) {
18099 Shuffle = Builder.CreateShuffleVector(
18100 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18101 } else {
18102 unsigned StartMask = 0;
18103 for (unsigned j = 1; j < LaneLen; j++) {
18104 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18105 if (Mask[IdxJ] >= 0) {
18106 StartMask = Mask[IdxJ] - j;
18107 break;
18108 }
18109 }
18110 // Note: Filling undef gaps with random elements is ok, since
18111 // those elements were being written anyway (with undefs).
18112 // In the case of all undefs we're defaulting to using elems from 0
18113 // Note: StartMask cannot be negative, it's checked in
18114 // isReInterleaveMask
18115 Shuffle = Builder.CreateShuffleVector(
18116 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18117 }
18118
18119 if (UseScalable)
18120 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18121 Shuffle, uint64_t(0));
18122
18123 Ops.push_back(Shuffle);
18124 }
18125
18126 if (UseScalable)
18127 Ops.push_back(PTrue);
18128
18129 // If we're generating more than one store, compute the base address of
18130 // subsequent stores as an offset from the previous.
18131 if (StoreCount > 0)
18132 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18133 BaseAddr, LaneLen * Factor);
18134
18135 Ops.push_back(BaseAddr);
18136 Builder.CreateCall(StNFunc, Ops);
18137 }
18138 return true;
18139}
18140
18141bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
18142 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18143 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18144 if (Factor != 2 && Factor != 4) {
18145 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
18146 return false;
18147 }
18148 auto *LI = dyn_cast<LoadInst>(Load);
18149 if (!LI)
18150 return false;
18151 assert(!Mask && "Unexpected mask on a load\n");
18152
18154
18155 const DataLayout &DL = LI->getModule()->getDataLayout();
18156 bool UseScalable;
18157 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18158 return false;
18159
18160 // TODO: Add support for using SVE instructions with fixed types later, using
18161 // the code from lowerInterleavedLoad to obtain the correct container type.
18162 if (UseScalable && !VTy->isScalableTy())
18163 return false;
18164
18165 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18166 VectorType *LdTy =
18167 VectorType::get(VTy->getElementType(),
18168 VTy->getElementCount().divideCoefficientBy(NumLoads));
18169
18170 Type *PtrTy = LI->getPointerOperandType();
18171 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18172 UseScalable, LdTy, PtrTy);
18173
18174 IRBuilder<> Builder(LI);
18175 Value *Pred = nullptr;
18176 if (UseScalable)
18177 Pred =
18178 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18179
18180 Value *BaseAddr = LI->getPointerOperand();
18181 Value *Result = nullptr;
18182 if (NumLoads > 1) {
18183 // Create multiple legal small ldN.
18184 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18185 for (unsigned I = 0; I < NumLoads; ++I) {
18186 Value *Offset = Builder.getInt64(I * Factor);
18187
18188 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18189 Value *LdN = nullptr;
18190 if (UseScalable)
18191 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18192 else
18193 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18194 Value *Idx =
18195 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18196 for (unsigned J = 0; J < Factor; ++J) {
18197 ExtractedLdValues[J] = Builder.CreateInsertVector(
18198 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18199 }
18200 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18201 }
18202
18203 // Merge the values from different factors.
18204 Result = PoisonValue::get(DI->getType());
18205 for (unsigned J = 0; J < Factor; ++J)
18206 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18207 } else {
18208 if (UseScalable)
18209 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18210 else
18211 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18212 }
18213
18214 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18215 DI->replaceAllUsesWith(Result);
18216 return true;
18217}
18218
18219bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
18220 Instruction *Store, Value *Mask,
18221 ArrayRef<Value *> InterleavedValues) const {
18222 unsigned Factor = InterleavedValues.size();
18223 if (Factor != 2 && Factor != 4) {
18224 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
18225 return false;
18226 }
18227 auto *SI = dyn_cast<StoreInst>(Store);
18228 if (!SI)
18229 return false;
18230 assert(!Mask && "Unexpected mask on plain store");
18231
18232 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18233 const DataLayout &DL = SI->getModule()->getDataLayout();
18234
18235 bool UseScalable;
18236 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18237 return false;
18238
18239 // TODO: Add support for using SVE instructions with fixed types later, using
18240 // the code from lowerInterleavedStore to obtain the correct container type.
18241 if (UseScalable && !VTy->isScalableTy())
18242 return false;
18243
18244 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18245
18246 VectorType *StTy =
18247 VectorType::get(VTy->getElementType(),
18248 VTy->getElementCount().divideCoefficientBy(NumStores));
18249
18250 Type *PtrTy = SI->getPointerOperandType();
18251 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18252 UseScalable, StTy, PtrTy);
18253
18254 IRBuilder<> Builder(SI);
18255
18256 Value *BaseAddr = SI->getPointerOperand();
18257 Value *Pred = nullptr;
18258
18259 if (UseScalable)
18260 Pred =
18261 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18262
18263 auto ExtractedValues = InterleavedValues;
18264 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18265 if (UseScalable)
18266 StoreOperands.push_back(Pred);
18267 StoreOperands.push_back(BaseAddr);
18268 for (unsigned I = 0; I < NumStores; ++I) {
18269 Value *Address = BaseAddr;
18270 if (NumStores > 1) {
18271 Value *Offset = Builder.getInt64(I * Factor);
18272 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18273 Value *Idx =
18274 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18275 for (unsigned J = 0; J < Factor; J++) {
18276 StoreOperands[J] =
18277 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18278 }
18279 // update the address
18280 StoreOperands[StoreOperands.size() - 1] = Address;
18281 }
18282 Builder.CreateCall(StNFunc, StoreOperands);
18283 }
18284 return true;
18285}
18286
18287EVT AArch64TargetLowering::getOptimalMemOpType(
18288 LLVMContext &Context, const MemOp &Op,
18289 const AttributeList &FuncAttributes) const {
18290 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18291 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18292 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18293 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18294 // taken one instruction to materialize the v2i64 zero and one store (with
18295 // restrictive addressing mode). Just do i64 stores.
18296 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18297 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18298 if (Op.isAligned(AlignCheck))
18299 return true;
18300 unsigned Fast;
18301 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18303 Fast;
18304 };
18305
18306 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18307 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18308 return MVT::v16i8;
18309 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18310 return MVT::f128;
18311 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18312 return MVT::i64;
18313 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18314 return MVT::i32;
18315 return MVT::Other;
18316}
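// Illustrative example for getOptimalMemOpType above (assumed): a 64-byte
// memset with 16-byte alignment and NEON available returns MVT::v16i8, so
// the value is materialized once in a q register and stored with q-register
// stores; a 16-byte memset is "small" and falls through to MVT::i64, using
// plain x-register stores instead.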
18317
18318LLT AArch64TargetLowering::getOptimalMemOpLLT(
18319 const MemOp &Op, const AttributeList &FuncAttributes) const {
18320 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18321 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18322 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18323 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18324 // taken one instruction to materialize the v2i64 zero and one store (with
18325 // restrictive addressing mode). Just do i64 stores.
18326 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18327 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18328 if (Op.isAligned(AlignCheck))
18329 return true;
18330 unsigned Fast;
18331 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18333 Fast;
18334 };
18335
18336 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18337 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18338 return LLT::fixed_vector(2, 64);
18339 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18340 return LLT::scalar(128);
18341 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18342 return LLT::scalar(64);
18343 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18344 return LLT::scalar(32);
18345 return LLT();
18346}
18347
18348// 12-bit optionally shifted immediates are legal for adds.
18349bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18350 if (Immed == std::numeric_limits<int64_t>::min()) {
18351 return false;
18352 }
18353 // Same encoding for add/sub, just flip the sign.
18354 return isLegalArithImmed((uint64_t)std::abs(Immed));
18355}
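// Illustrative examples for isLegalAddImmediate above (assumed): 4095 (#0xfff)
// and 4096 (#1, lsl #12) are legal add immediates, while 4097 is not and
// would need to be materialized separately; negative values are handled via
// the equivalent sub encoding by taking the absolute value above.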
18356
18357bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18358 // We will only emit addvl/inc* instructions for SVE2
18359 if (!Subtarget->hasSVE2())
18360 return false;
18361
18362 // addvl's immediates are in terms of the number of bytes in a register.
18363 // Since there are 16 in the base supported size (128bits), we need to
18364 // divide the immediate by that much to give us a useful immediate to
18365 // multiply by vscale. We can't have a remainder as a result of this.
18366 if (Imm % 16 == 0)
18367 return isInt<6>(Imm / 16);
18368
18369 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18370 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18371 // of addvl as a result, so only take h|w|d into account.
18372 // Dec[h|w|d] will cover subtractions.
18373 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18374 // FIXME: Can we make use of other patterns to cover other immediates?
18375
18376 // inch|dech
18377 if (Imm % 8 == 0)
18378 return std::abs(Imm / 8) <= 16;
18379 // incw|decw
18380 if (Imm % 4 == 0)
18381 return std::abs(Imm / 4) <= 16;
18382 // incd|decd
18383 if (Imm % 2 == 0)
18384 return std::abs(Imm / 2) <= 16;
18385
18386 return false;
18387}
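// Illustrative examples for isLegalAddScalableImmediate above (assumed): with
// SVE2, Imm = 32 is legal since 32 / 16 = 2 fits addvl's 6-bit immediate
// ("addvl x0, x0, #2"), and Imm = 8 is legal via inch with multiplier 1,
// while Imm = 3 matches none of the addvl/inc/dec forms and is rejected.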
18388
18389// Return false to prevent folding
18390// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18391// if the folding leads to worse code.
18392bool AArch64TargetLowering::isMulAddWithConstProfitable(
18393 SDValue AddNode, SDValue ConstNode) const {
18394 // Let the DAGCombiner decide for vector types and large types.
18395 const EVT VT = AddNode.getValueType();
18396 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18397 return true;
18398
18399 // It is worse if c1 is a legal add immediate, while c1*c2 is not
18400 // and has to be composed of at least two instructions.
18401 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18402 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18403 const int64_t C1 = C1Node->getSExtValue();
18404 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18406 return true;
18407 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18408 // Adapt to the width of a register.
18409 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18410 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18411 if (Insn.size() > 1)
18412 return false;
18413
18414 // Default to true and let the DAGCombiner decide.
18415 return true;
18416}
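// Illustrative example for the hook above (assumed constants): for
// (mul (add x, 1), 0x1234567800000000) the product constant needs a
// multi-instruction MOVZ/MOVK sequence, so the fold into
// (add (mul x, c2), c2*c1) is rejected; when c1*c2 still fits a single
// MOV-style immediate the decision is left to the DAGCombiner.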
18417
18418// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18419// immediates is the same as for an add or a sub.
18420bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18421 return isLegalAddImmediate(Immed);
18422}
18423
18424/// isLegalAddressingMode - Return true if the addressing mode represented
18425/// by AM is legal for this target, for a load/store of the specified type.
18426bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18427 const AddrMode &AMode, Type *Ty,
18428 unsigned AS, Instruction *I) const {
18429 // AArch64 has five basic addressing modes:
18430 // reg
18431 // reg + 9-bit signed offset
18432 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18433 // reg1 + reg2
18434 // reg + SIZE_IN_BYTES * reg
18435
18436 // No global is ever allowed as a base.
18437 if (AMode.BaseGV)
18438 return false;
18439
18440 // No reg+reg+imm addressing.
18441 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18442 return false;
18443
18444 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18445 // `2*ScaledReg` into `BaseReg + ScaledReg`
18446 AddrMode AM = AMode;
18447 if (AM.Scale && !AM.HasBaseReg) {
18448 if (AM.Scale == 1) {
18449 AM.HasBaseReg = true;
18450 AM.Scale = 0;
18451 } else if (AM.Scale == 2) {
18452 AM.HasBaseReg = true;
18453 AM.Scale = 1;
18454 } else {
18455 return false;
18456 }
18457 }
18458
18459 // A base register is required in all addressing modes.
18460 if (!AM.HasBaseReg)
18461 return false;
18462
18463 if (Ty->isScalableTy()) {
18464 if (isa<ScalableVectorType>(Ty)) {
18465 // See if we have a foldable vscale-based offset, for vector types which
18466 // are either legal or smaller than the minimum; more work will be
18467 // required if we need to consider addressing for types which need
18468 // legalization by splitting.
18469 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18470 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18471 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18472 isPowerOf2_64(VecNumBytes))
18473 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18474
18475 uint64_t VecElemNumBytes =
18476 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18477 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18478 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18479 }
18480
18481 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18482 }
18483
18484 // No scalable offsets allowed for non-scalable types.
18485 if (AM.ScalableOffset)
18486 return false;
18487
18488 // check reg + imm case:
18489 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18490 uint64_t NumBytes = 0;
18491 if (Ty->isSized()) {
18492 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18493 NumBytes = NumBits / 8;
18494 if (!isPowerOf2_64(NumBits))
18495 NumBytes = 0;
18496 }
18497
18498 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18499 AM.Scale);
18500}
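// Illustrative examples for isLegalAddressingMode above (assumed): for an i32
// access, "reg + #4092" uses the scaled unsigned 12-bit form (4092 = 1023*4),
// "reg + #-256" uses the signed 9-bit form, and "reg1 + reg2, lsl #2" uses
// the scaled register form; "reg + reg + #8" is rejected because AArch64 has
// no reg+reg+imm addressing mode.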
18501
18502// Check whether the two offsets belong to the same imm24 range and share the
18503// same high 12 bits; if so, the high part can be folded into the offset of an add.
18504int64_t
18505AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18506 int64_t MaxOffset) const {
18507 int64_t HighPart = MinOffset & ~0xfffULL;
18508 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18509 // Rebase the value to an integer multiple of imm12.
18510 return HighPart;
18511 }
18512
18513 return 0;
18514}
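// Worked example for the hook above (illustrative): for MinOffset = 4100 and
// MaxOffset = 4200 both offsets share the high part 4096, which is itself a
// legal add immediate, so 4096 is returned and the accesses can be rebased to
// offsets 4..104 from "add xN, xBase, #1, lsl #12".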
18515
18516bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18517 // Consider splitting a large offset of a struct or array.
18518 return true;
18519}
18520
18521bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18522 const MachineFunction &MF, EVT VT) const {
18523 EVT ScalarVT = VT.getScalarType();
18524
18525 if (!ScalarVT.isSimple())
18526 return false;
18527
18528 switch (ScalarVT.getSimpleVT().SimpleTy) {
18529 case MVT::f16:
18530 return Subtarget->hasFullFP16();
18531 case MVT::f32:
18532 case MVT::f64:
18533 return true;
18534 case MVT::bf16:
18535 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18536 Subtarget->isNonStreamingSVEorSME2Available();
18537 default:
18538 break;
18539 }
18540
18541 return false;
18542}
18543
18544bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18545 Type *Ty) const {
18546 switch (Ty->getScalarType()->getTypeID()) {
18547 case Type::FloatTyID:
18548 case Type::DoubleTyID:
18549 return true;
18550 default:
18551 return false;
18552 }
18553}
18554
18556 EVT VT, CodeGenOptLevel OptLevel) const {
18557 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18559}
18560
18561const MCPhysReg *
18562AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18563 // LR is a callee-save register, but we must treat it as clobbered by any call
18564 // site. Hence we include LR in the scratch registers, which are in turn added
18565 // as implicit-defs for stackmaps and patchpoints.
18566 static const MCPhysReg ScratchRegs[] = {
18567 AArch64::X16, AArch64::X17, AArch64::LR, 0
18568 };
18569 return ScratchRegs;
18570}
18571
18572ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18573 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18574 return RCRegs;
18575}
18576
18577bool
18578AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18579 CombineLevel Level) const {
18580 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18581 N->getOpcode() == ISD::SRL) &&
18582 "Expected shift op");
18583
18584 SDValue ShiftLHS = N->getOperand(0);
18585 EVT VT = N->getValueType(0);
18586
18587 if (!ShiftLHS->hasOneUse())
18588 return false;
18589
18590 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18591 !ShiftLHS.getOperand(0)->hasOneUse())
18592 return false;
18593
18594 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18595 // combine it with shift 'N' to let it be lowered to UBFX except:
18596 // ((x >> C) & mask) << C.
18597 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18598 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18599 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18600 if (isMask_64(TruncMask)) {
18601 SDValue AndLHS = ShiftLHS.getOperand(0);
18602 if (AndLHS.getOpcode() == ISD::SRL) {
18603 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18604 if (N->getOpcode() == ISD::SHL)
18605 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18606 return SRLC->getZExtValue() == SHLC->getZExtValue();
18607 return false;
18608 }
18609 }
18610 }
18611 }
18612 return true;
18613}
18614
18615bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18616 const SDNode *N) const {
18617 assert(N->getOpcode() == ISD::XOR &&
18618 (N->getOperand(0).getOpcode() == ISD::SHL ||
18619 N->getOperand(0).getOpcode() == ISD::SRL) &&
18620 "Expected XOR(SHIFT) pattern");
18621
18622 // Only commute if the entire NOT mask is a hidden shifted mask.
18623 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18624 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18625 if (XorC && ShiftC) {
18626 unsigned MaskIdx, MaskLen;
18627 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18628 unsigned ShiftAmt = ShiftC->getZExtValue();
18629 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18630 if (N->getOperand(0).getOpcode() == ISD::SHL)
18631 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18632 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18633 }
18634 }
18635
18636 return false;
18637}
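// Illustrative example for the hook above (assumed i32 values): for
// "xor (shl x, 8), 0xffffff00" the NOT mask is a shifted mask of length 24
// starting at bit 8, so commuting to "shl (xor x, 0xffffff), 8" is allowed;
// with a mask such as 0x00ff0000 the mask does not line up with the shift
// and the original form is kept.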
18638
18639bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18640 const SDNode *N, CombineLevel Level) const {
18641 assert(((N->getOpcode() == ISD::SHL &&
18642 N->getOperand(0).getOpcode() == ISD::SRL) ||
18643 (N->getOpcode() == ISD::SRL &&
18644 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18645 "Expected shift-shift mask");
18646 // Don't allow multiuse shift folding with the same shift amount.
18647 if (!N->getOperand(0)->hasOneUse())
18648 return false;
18649
18650 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18651 EVT VT = N->getValueType(0);
18652 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18653 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18654 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18655 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18656 }
18657
18658 // We do not need to fold when this shift is used in the specific load case:
18659 // (ldr x, (add x, (shl (srl x, c1) 2)))
18660 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18661 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18662 unsigned ShlAmt = C2->getZExtValue();
18663 if (auto ShouldADD = *N->user_begin();
18664 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18665 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18666 EVT MemVT = Load->getMemoryVT();
18667
18668 if (Load->getValueType(0).isScalableVector())
18669 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18670
18671 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18672 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18673 }
18674 }
18675 }
18676 }
18677
18678 return true;
18679}
18680
18681bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18682 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18683 SDValue Y) const {
18684 return VT.isScalableVector() && isTypeLegal(VT) &&
18685 SelectOpcode == ISD::VSELECT;
18686}
18687
18688bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18689 Type *Ty) const {
18690 assert(Ty->isIntegerTy());
18691
18692 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18693 if (BitSize == 0)
18694 return false;
18695
18696 int64_t Val = Imm.getSExtValue();
18697 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18698 return true;
18699
18700 if (Val < 0)
18701 Val = ~Val;
18702 if (BitSize == 32)
18703 Val &= (1LL << 32) - 1;
18704
18705 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18706 // MOVZ is free so return true for one or fewer MOVK.
18707 return Shift < 3;
18708}
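// Illustrative examples for the hook above (assumed constants): 0xabcdef has
// its highest set bit below bit 32, so it needs at most MOVZ plus one MOVK
// (Shift = 1) and is materialized inline, while 0x123456789abcdef0 would need
// MOVZ plus three MOVKs (Shift = 3) and is better left as a constant load.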
18709
18710bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18711 unsigned Index) const {
18713 return false;
18714
18715 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18716}
18717
18718/// Turn vector tests of the signbit in the form of:
18719/// xor (sra X, elt_size(X)-1), -1
18720/// into:
18721/// cmge X, X, #0
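/// For example (illustrative), for a v4i32 X this rewrites
///   xor (AArch64ISD::VASHR X, #31), splat(-1)
/// into setcc(X, splat(0), setge), which should select to a compare
/// greater-than-or-equal-to-zero instruction.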
18723 const AArch64Subtarget *Subtarget) {
18724 EVT VT = N->getValueType(0);
18725 if (!Subtarget->hasNEON() || !VT.isVector())
18726 return SDValue();
18727
18728 // There must be a shift right algebraic before the xor, and the xor must be a
18729 // 'not' operation.
18730 SDValue Shift = N->getOperand(0);
18731 SDValue Ones = N->getOperand(1);
18732 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18734 return SDValue();
18735
18736 // The shift should be smearing the sign bit across each vector element.
18737 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18738 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18739 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18740 return SDValue();
18741
18742 SDLoc DL(N);
18743 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18744 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18745}
18746
18747// Given a vecreduce_add node, detect the below pattern and convert it to the
18748// node sequence with UABDL, [S|U]ABD and UADDLP.
18749//
18750// i32 vecreduce_add(
18751// v16i32 abs(
18752// v16i32 sub(
18753// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18754//
18755// or
18756//
18757// i32 vecreduce_add(
18758// v16i32 zext(
18759// v16i16 abs(
18760// v16i16 sub(
18761// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18762//
18763// =================>
18764// i32 vecreduce_add(
18765// v4i32 UADDLP(
18766// v8i16 add(
18767// v8i16 zext(
18768// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18769// v8i16 zext(
18770// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18772 SelectionDAG &DAG) {
18773 // Assumed i32 vecreduce_add
18774 if (N->getValueType(0) != MVT::i32)
18775 return SDValue();
18776
18777 SDValue VecReduceOp0 = N->getOperand(0);
18778 bool SawTrailingZext = false;
18779 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18780 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18781 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18782 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18783 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18784 SawTrailingZext = true;
18785 VecReduceOp0 = VecReduceOp0.getOperand(0);
18786 }
18787
18788 // The ABS input is v16i16 if we looked through the zext above, v16i32 otherwise.
18789 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18790 // Assumed v16i16 or v16i32 abs input
18791 unsigned Opcode = VecReduceOp0.getOpcode();
18792 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18793 return SDValue();
18794
18795 SDValue ABS = VecReduceOp0;
18796 // Assumed v16i16 or v16i32 sub
18797 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18798 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18799 return SDValue();
18800
18801 SDValue SUB = ABS->getOperand(0);
18802 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18803 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18804 // Assumed v16i16 or v16i32 type
18805 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18806 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18807 return SDValue();
18808
18809 // Assumed zext or sext
18810 bool IsZExt = false;
18811 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18812 IsZExt = true;
18813 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18814 IsZExt = false;
18815 } else
18816 return SDValue();
18817
18818 SDValue EXT0 = SUB->getOperand(0);
18819 SDValue EXT1 = SUB->getOperand(1);
18820 // Assumed zext's operand has v16i8 type
18821 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18822 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18823 return SDValue();
18824
18825 // Pattern is detected. Let's convert it to a sequence of nodes.
18826 SDLoc DL(N);
18827
18828 // First, create the node pattern of UABD/SABD.
18829 SDValue UABDHigh8Op0 =
18830 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18831 DAG.getConstant(8, DL, MVT::i64));
18832 SDValue UABDHigh8Op1 =
18833 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18834 DAG.getConstant(8, DL, MVT::i64));
18835 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18836 UABDHigh8Op0, UABDHigh8Op1);
18837 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18838
18839 // Second, create the node pattern of UABAL.
18840 SDValue UABDLo8Op0 =
18841 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18842 DAG.getConstant(0, DL, MVT::i64));
18843 SDValue UABDLo8Op1 =
18844 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18845 DAG.getConstant(0, DL, MVT::i64));
18846 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18847 UABDLo8Op0, UABDLo8Op1);
18848 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18849 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18850
18851 // Third, create the node of UADDLP.
18852 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18853
18854 // Fourth, create the node of VECREDUCE_ADD.
18855 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18856}
18857
18858static SDValue
18860 const AArch64Subtarget *ST) {
18861 if (DCI.isBeforeLegalize())
18862 return SDValue();
18863
18864 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18865 /*IsEqual=*/false))
18866 return While;
18867
18868 if (!N->getValueType(0).isScalableVector() ||
18869 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18870 return SDValue();
18871
18872 // Count the number of users which are extract_subvectors.
18873 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
18874 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
18875 });
18876
18877 auto MaskEC = N->getValueType(0).getVectorElementCount();
18878 if (!MaskEC.isKnownMultipleOf(NumExts))
18879 return SDValue();
18880
18881 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
18882 if (ExtMinEC.getKnownMinValue() < 2)
18883 return SDValue();
18884
18885 SmallVector<SDNode *> Extracts(NumExts, nullptr);
18886 for (SDNode *Use : N->users()) {
18887 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18888 continue;
18889
18890 // Ensure the extract type is correct (e.g. if NumExts is 4 and
18891 // the mask return type is nxv8i1, each extract should be nxv2i1).
18892 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18893 return SDValue();
18894
18895 // There should be exactly one extract for each part of the mask.
18896 unsigned Offset = Use->getConstantOperandVal(1);
18897 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18898 if (Extracts[Part] != nullptr)
18899 return SDValue();
18900
18901 Extracts[Part] = Use;
18902 }
18903
18904 SelectionDAG &DAG = DCI.DAG;
18905 SDLoc DL(N);
18906 SDValue ID =
18907 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18908
18909 SDValue Idx = N->getOperand(0);
18910 SDValue TC = N->getOperand(1);
18911 if (Idx.getValueType() != MVT::i64) {
18912 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18913 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18914 }
18915
18916 // Create the whilelo_x2 intrinsics from each pair of extracts
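  // For example (illustrative): an nxv16i1 mask whose users include four
  // nxv4i1 extract_subvectors becomes two whilelo_x2 intrinsic calls, each
  // producing a pair of nxv4i1 predicates; the pairs are concatenated back to
  // nxv16i1 below to replace the original node.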
18917 EVT ExtVT = Extracts[0]->getValueType(0);
18918 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18919 auto R =
18920 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18921 DCI.CombineTo(Extracts[0], R.getValue(0));
18922 DCI.CombineTo(Extracts[1], R.getValue(1));
18923 SmallVector<SDValue> Concats = {DAG.getNode(
18924 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
18925
18926 if (NumExts == 2) {
18927 assert(N->getValueType(0) == DoubleExtVT);
18928 return Concats[0];
18929 }
18930
18931 auto Elts =
18932 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
18933 for (unsigned I = 2; I < NumExts; I += 2) {
18934 // After the first whilelo_x2, we need to increment the starting value.
18935 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
18936 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18937 DCI.CombineTo(Extracts[I], R.getValue(0));
18938 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18939 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
18940 R.getValue(0), R.getValue(1)));
18941 }
18942
18943 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
18944}
18945
18946// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
18947// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18948// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18949// If we have vectors larger than v16i8 we extract v16i8 vectors, follow the
18950// same steps above to get DOT instructions, concatenate them, and generate
18951// vecreduce.add(concat_vector(DOT, DOT2, ..)).
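// For example (illustrative):
//   i32 vecreduce.add(v16i32 zext(v16i8 A))
// becomes
//   i32 vecreduce.add(v4i32 UDOT(zeroes, A, splat(i8 1)))
// when the dot product instructions are available.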
18953 const AArch64Subtarget *ST) {
18954 if (!ST->isNeonAvailable())
18955 return SDValue();
18956
18957 if (!ST->hasDotProd())
18959
18960 SDValue Op0 = N->getOperand(0);
18961 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18962 Op0.getValueType().getVectorElementType() != MVT::i32)
18963 return SDValue();
18964
18965 unsigned ExtOpcode = Op0.getOpcode();
18966 SDValue A = Op0;
18967 SDValue B;
18968 unsigned DotOpcode;
18969 if (ExtOpcode == ISD::MUL) {
18970 A = Op0.getOperand(0);
18971 B = Op0.getOperand(1);
18972 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18973 return SDValue();
18974 auto OpCodeA = A.getOpcode();
18975 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18976 return SDValue();
18977
18978 auto OpCodeB = B.getOpcode();
18979 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18980 return SDValue();
18981
18982 if (OpCodeA == OpCodeB) {
18983 DotOpcode =
18984 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18985 } else {
18986 // Check USDOT support.
18987 if (!ST->hasMatMulInt8())
18988 return SDValue();
18989 DotOpcode = AArch64ISD::USDOT;
18990 if (OpCodeA == ISD::SIGN_EXTEND)
18991 std::swap(A, B);
18992 }
18993 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18994 DotOpcode = AArch64ISD::UDOT;
18995 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18996 DotOpcode = AArch64ISD::SDOT;
18997 } else {
18998 return SDValue();
18999 }
19000
19001 EVT Op0VT = A.getOperand(0).getValueType();
19002 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19003 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19004 if (!IsValidElementCount || !IsValidSize)
19005 return SDValue();
19006
19007 SDLoc DL(Op0);
19008 // For non-MLA reductions B can be set to 1. For MLA we take the operand of
19009 // the extend as B.
19010 if (!B)
19011 B = DAG.getConstant(1, DL, Op0VT);
19012 else
19013 B = B.getOperand(0);
19014
19015 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19016 unsigned NumOfVecReduce;
19017 EVT TargetType;
19018 if (IsMultipleOf16) {
19019 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19020 TargetType = MVT::v4i32;
19021 } else {
19022 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19023 TargetType = MVT::v2i32;
19024 }
19025 // Handle the case where we need to generate only one Dot operation.
19026 if (NumOfVecReduce == 1) {
19027 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19028 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19029 A.getOperand(0), B);
19030 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19031 }
19032 // Generate Dot instructions that are multiple of 16.
19033 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19034 SmallVector<SDValue, 4> SDotVec16;
19035 unsigned I = 0;
19036 for (; I < VecReduce16Num; I += 1) {
19037 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19038 SDValue Op0 =
19039 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19040 DAG.getConstant(I * 16, DL, MVT::i64));
19041 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19042 DAG.getConstant(I * 16, DL, MVT::i64));
19043 SDValue Dot =
19044 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19045 SDotVec16.push_back(Dot);
19046 }
19047 // Concatenate dot operations.
19048 EVT SDot16EVT =
19049 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19050 SDValue ConcatSDot16 =
19051 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19052 SDValue VecReduceAdd16 =
19053 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19054 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19055 if (VecReduce8Num == 0)
19056 return VecReduceAdd16;
19057
19058 // Generate the remainder Dot operation that is multiple of 8.
19059 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19060 SDValue Vec8Op0 =
19061 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19062 DAG.getConstant(I * 16, DL, MVT::i64));
19063 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19064 DAG.getConstant(I * 16, DL, MVT::i64));
19065 SDValue Dot =
19066 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19067 SDValue VecReduceAdd8 =
19068 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19069 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19070 VecReduceAdd8);
19071}
19072
19073// Given an (integer) vecreduce, we know the order of the inputs does not
19074// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19075// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19076// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
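// For example (illustrative), with a v8i16 source x:
//   UADDV(add(zext(extract_subvector(x, 0)) to v4i32,
//             zext(extract_subvector(x, 4)) to v4i32))
// becomes UADDV(UADDLP(x) : v4i32); UADDLP pairs different lanes than the
// original add, but the overall reduction sum is unchanged.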
19078 auto DetectAddExtract = [&](SDValue A) {
19079 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19080 // UADDLP(x) if found.
19081 assert(A.getOpcode() == ISD::ADD);
19082 EVT VT = A.getValueType();
19083 SDValue Op0 = A.getOperand(0);
19084 SDValue Op1 = A.getOperand(1);
19085 if (Op0.getOpcode() != Op1.getOpcode() ||
19086 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19087 Op0.getOpcode() != ISD::SIGN_EXTEND))
19088 return SDValue();
19089 SDValue Ext0 = Op0.getOperand(0);
19090 SDValue Ext1 = Op1.getOperand(0);
19091 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19093 Ext0.getOperand(0) != Ext1.getOperand(0))
19094 return SDValue();
19095 // Check that the type is twice the add types, and the extract are from
19096 // upper/lower parts of the same source.
19098 VT.getVectorNumElements() * 2)
19099 return SDValue();
19100 if ((Ext0.getConstantOperandVal(1) != 0 ||
19102 (Ext1.getConstantOperandVal(1) != 0 ||
19104 return SDValue();
19105 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19106 : AArch64ISD::SADDLP;
19107 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19108 };
19109
19110 if (SDValue R = DetectAddExtract(A))
19111 return R;
19112
19113 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19114 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19115 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19116 A.getOperand(1));
19117 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19118 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19119 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19120 A.getOperand(0));
19121 return SDValue();
19122}
19123
19124// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19125// UADDLV(concat), where the concat represents the 64-bit zext sources.
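// For example (illustrative), with v2i32 sources a and b:
//   UADDV(add(zext(a) to v2i64, zext(b) to v2i64))
// is replaced by UADDLV(concat_vectors(a, b) : v4i32) with a v2i64 result
// type, so a single UADDLV performs the whole reduction.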
19127 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19128 // UADDLV(concat(zext, zext)) if found.
19129 assert(A.getOpcode() == ISD::ADD);
19130 EVT VT = A.getValueType();
19131 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19132 return SDValue();
19133 SDValue Op0 = A.getOperand(0);
19134 SDValue Op1 = A.getOperand(1);
19135 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19136 return SDValue();
19137 SDValue Ext0 = Op0.getOperand(0);
19138 SDValue Ext1 = Op1.getOperand(0);
19139 EVT ExtVT0 = Ext0.getValueType();
19140 EVT ExtVT1 = Ext1.getValueType();
19141 // Check zext VTs are the same and 64-bit length.
19142 if (ExtVT0 != ExtVT1 ||
19143 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19144 return SDValue();
19145 // Get VT for concat of zext sources.
19146 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19147 SDValue Concat =
19148 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19149
19150 switch (VT.getSimpleVT().SimpleTy) {
19151 case MVT::v2i64:
19152 case MVT::v4i32:
19153 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19154 case MVT::v8i16: {
19155 SDValue Uaddlv =
19156 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19157 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19158 }
19159 default:
19160 llvm_unreachable("Unhandled vector type");
19161 }
19162}
19163
19165 SDValue A = N->getOperand(0);
19166 if (A.getOpcode() == ISD::ADD) {
19167 if (SDValue R = performUADDVAddCombine(A, DAG))
19168 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19169 else if (SDValue R = performUADDVZextCombine(A, DAG))
19170 return R;
19171 }
19172
19173 // uaddv(A) --> A if all lanes of A are known to be zero except the 0th lane.
19174 MVT OpVT = A.getSimpleValueType();
19175 assert(N->getSimpleValueType(0) == OpVT &&
19176 "The operand type should be consistent with the result type of UADDV");
19178 Mask.clearBit(0);
19179 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19180 if (KnownLeadingLanes.isZero())
19181 return A;
19182
19183 return SDValue();
19184}
19185
19188 const AArch64Subtarget *Subtarget) {
19189 if (DCI.isBeforeLegalizeOps())
19190 return SDValue();
19191
19192 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19193}
19194
19195SDValue
19196AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19197 SelectionDAG &DAG,
19198 SmallVectorImpl<SDNode *> &Created) const {
19199 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19200 if (isIntDivCheap(N->getValueType(0), Attr))
19201 return SDValue(N, 0); // Lower SDIV as SDIV
19202
19203 EVT VT = N->getValueType(0);
19204
19205 // If SVE is available, we can generate
19206 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19207 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19208 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19209 return SDValue(N, 0);
19210
19211 // fold (sdiv X, pow2)
19212 if ((VT != MVT::i32 && VT != MVT::i64) ||
19213 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19214 return SDValue();
19215
19216 // If the divisor is 2 or -2, the default expansion is better. It adds the
19217 // dividend's sign bit (X >> (BitWidth - 1)) to X before shifting right.
19218 if (Divisor == 2 ||
19219 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19220 return SDValue();
19221
19222 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19223}
19224
19225SDValue
19226AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19227 SelectionDAG &DAG,
19228 SmallVectorImpl<SDNode *> &Created) const {
19229 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19230 if (isIntDivCheap(N->getValueType(0), Attr))
19231 return SDValue(N, 0); // Lower SREM as SREM
19232
19233 EVT VT = N->getValueType(0);
19234
19235 // For scalable and fixed types, mark them as cheap so we can handle them much
19236 // later. This allows us to handle larger-than-legal types.
19237 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19238 return SDValue(N, 0);
19239
19240 // fold (srem X, pow2)
19241 if ((VT != MVT::i32 && VT != MVT::i64) ||
19242 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19243 return SDValue();
19244
19245 unsigned Lg2 = Divisor.countr_zero();
19246 if (Lg2 == 0)
19247 return SDValue();
19248
19249 SDLoc DL(N);
19250 SDValue N0 = N->getOperand(0);
19251 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19252 SDValue Zero = DAG.getConstant(0, DL, VT);
19253 SDValue CCVal, CSNeg;
19254 if (Lg2 == 1) {
19255 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19256 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19257 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19258
19259 Created.push_back(Cmp.getNode());
19260 Created.push_back(And.getNode());
19261 } else {
19262 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19263 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19264
19265 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19266 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19267 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19268 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19269 Negs.getValue(1));
19270
19271 Created.push_back(Negs.getNode());
19272 Created.push_back(AndPos.getNode());
19273 Created.push_back(AndNeg.getNode());
19274 }
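  // Worked example (illustrative): for (srem x, 8), Lg2 is 3, so the result is
  // a csneg of (x & 7) and ((0 - x) & 7) on the MI condition, selecting x & 7
  // when 0 - x is negative (x > 0) and -((-x) & 7) otherwise, which matches
  // srem semantics for both signs of x.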
19275
19276 return CSNeg;
19277}
19278
19279static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19280 switch(getIntrinsicID(S.getNode())) {
19281 default:
19282 break;
19283 case Intrinsic::aarch64_sve_cntb:
19284 return 8;
19285 case Intrinsic::aarch64_sve_cnth:
19286 return 16;
19287 case Intrinsic::aarch64_sve_cntw:
19288 return 32;
19289 case Intrinsic::aarch64_sve_cntd:
19290 return 64;
19291 }
19292 return {};
19293}
19294
19295/// Calculates what the pre-extend type is, based on the extension
19296/// operation node provided by \p Extend.
19297///
19298/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19299/// pre-extend type is pulled directly from the operand, while other extend
19300/// operations need a bit more inspection to get this information.
19301///
19302/// \param Extend The SDNode from the DAG that represents the extend operation
19303///
19304/// \returns The type representing the \p Extend source type, or \p MVT::Other
19305/// if no valid type can be determined
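/// For example (illustrative): both (and x, 0xffff) and
/// (zero_extend (i16 y) to i32) yield MVT::i16, since each behaves as a
/// zero-extension from 16 bits.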
19307 switch (Extend.getOpcode()) {
19308 case ISD::SIGN_EXTEND:
19309 case ISD::ZERO_EXTEND:
19310 case ISD::ANY_EXTEND:
19311 return Extend.getOperand(0).getValueType();
19312 case ISD::AssertSext:
19313 case ISD::AssertZext:
19315 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19316 if (!TypeNode)
19317 return MVT::Other;
19318 return TypeNode->getVT();
19319 }
19320 case ISD::AND: {
19323 if (!Constant)
19324 return MVT::Other;
19325
19326 uint32_t Mask = Constant->getZExtValue();
19327
19328 if (Mask == UCHAR_MAX)
19329 return MVT::i8;
19330 else if (Mask == USHRT_MAX)
19331 return MVT::i16;
19332 else if (Mask == UINT_MAX)
19333 return MVT::i32;
19334
19335 return MVT::Other;
19336 }
19337 default:
19338 return MVT::Other;
19339 }
19340}
19341
19342/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19343/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19344/// SExt/ZExt rather than the scalar SExt/ZExt
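/// For example (illustrative):
///   v4i32 buildvector(zext(i16 a), zext(i16 b), zext(i16 c), zext(i16 d))
/// becomes
///   zext(v4i16 buildvector(a, b, c, d)) to v4i32
/// so the extension is performed once on the whole vector.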
19346 EVT VT = BV.getValueType();
19347 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19349 return SDValue();
19350
19351 // Use the first item in the buildvector/shuffle to get the size of the
19352 // extend, and make sure it looks valid.
19353 SDValue Extend = BV->getOperand(0);
19354 unsigned ExtendOpcode = Extend.getOpcode();
19355 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19356 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19357 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19358 ExtendOpcode == ISD::AssertSext;
19359 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19360 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19361 return SDValue();
19362 // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to
19363 // ensure calculatePreExtendType will work without issue.
19364 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19365 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19366 return SDValue();
19367
19368 // Restrict valid pre-extend data type
19369 EVT PreExtendType = calculatePreExtendType(Extend);
19370 if (PreExtendType == MVT::Other ||
19371 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19372 return SDValue();
19373
19374 // Make sure all other operands are equally extended.
19375 bool SeenZExtOrSExt = !IsAnyExt;
19376 for (SDValue Op : drop_begin(BV->ops())) {
19377 if (Op.isUndef())
19378 continue;
19379
19380 if (calculatePreExtendType(Op) != PreExtendType)
19381 return SDValue();
19382
19383 unsigned Opc = Op.getOpcode();
19384 if (Opc == ISD::ANY_EXTEND)
19385 continue;
19386
19387 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19389
19390 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19391 return SDValue();
19392
19393 IsSExt = OpcIsSExt;
19394 SeenZExtOrSExt = true;
19395 }
19396
19397 SDValue NBV;
19398 SDLoc DL(BV);
19399 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19400 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19401 EVT PreExtendLegalType =
19402 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19404 for (SDValue Op : BV->ops())
19405 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19406 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19407 PreExtendLegalType));
19408 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19409 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19410 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19411 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19412 BV.getOperand(1).isUndef()
19413 ? DAG.getUNDEF(PreExtendVT)
19414 : BV.getOperand(1).getOperand(0),
19415 cast<ShuffleVectorSDNode>(BV)->getMask());
19416 }
19417 unsigned ExtOpc = !SeenZExtOrSExt
19419 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19420 return DAG.getNode(ExtOpc, DL, VT, NBV);
19421}
19422
19423/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19424/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19426 // If the value type isn't a vector, none of the operands are going to be dups
19427 EVT VT = Mul->getValueType(0);
19428 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19429 return SDValue();
19430
19431 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19432 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19433
19434 // Neither operands have been changed, don't make any further changes
19435 if (!Op0 && !Op1)
19436 return SDValue();
19437
19438 SDLoc DL(Mul);
19439 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19440 Op1 ? Op1 : Mul->getOperand(1));
19441}
19442
19443// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19444// Same for other types with equivalent constants.
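// The idea (illustrative): viewing each i32 lane as two i16 halves, the
// srl/and pair isolates the sign bit of each half and the multiply by 0xffff
// smears that bit across its half, which is exactly a compare-less-than-zero
// on the v8i16 reinterpretation of the input.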
19446 EVT VT = N->getValueType(0);
19447 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19448 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19449 return SDValue();
19450 if (N->getOperand(0).getOpcode() != ISD::AND ||
19451 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19452 return SDValue();
19453
19454 SDValue And = N->getOperand(0);
19455 SDValue Srl = And.getOperand(0);
19456
19457 APInt V1, V2, V3;
19458 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19459 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19461 return SDValue();
19462
19463 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19464 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19465 V3 != (HalfSize - 1))
19466 return SDValue();
19467
19468 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19469 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19470 VT.getVectorElementCount() * 2);
19471
19472 SDLoc DL(N);
19473 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19474 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19475 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19476 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19477}
19478
19479// Transform vector add(zext i8 to i32, zext i8 to i32)
19480// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19481// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19482// extends.
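// The final sign_extend is safe for either input extension (illustrative
// reasoning): two values extended from i8 (or i16) into the half-width type
// produce a sum that still fits in that type, so sign-extending the narrow
// add gives the same value as the original wide add.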
19484 EVT VT = N->getValueType(0);
19485 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19486 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19487 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19488 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19489 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19490 N->getOperand(0).getOperand(0).getValueType() !=
19491 N->getOperand(1).getOperand(0).getValueType())
19492 return SDValue();
19493
19494 if (N->getOpcode() == ISD::MUL &&
19495 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19496 return SDValue();
19497
19498 SDValue N0 = N->getOperand(0).getOperand(0);
19499 SDValue N1 = N->getOperand(1).getOperand(0);
19500 EVT InVT = N0.getValueType();
19501
19502 EVT S1 = InVT.getScalarType();
19503 EVT S2 = VT.getScalarType();
19504 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19505 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19506 SDLoc DL(N);
19507 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19510 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19511 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19512 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19513 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19514 : (unsigned)ISD::SIGN_EXTEND,
19515 DL, VT, NewOp);
19516 }
19517 return SDValue();
19518}
19519
19522 const AArch64Subtarget *Subtarget) {
19523
19524 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19525 return Ext;
19527 return Ext;
19528 if (SDValue Ext = performVectorExtCombine(N, DAG))
19529 return Ext;
19530
19531 if (DCI.isBeforeLegalizeOps())
19532 return SDValue();
19533
19534 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19535 // and in MachineCombiner pass, add+mul will be combined into madd.
19536 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19537 SDLoc DL(N);
19538 EVT VT = N->getValueType(0);
19539 SDValue N0 = N->getOperand(0);
19540 SDValue N1 = N->getOperand(1);
19541 SDValue MulOper;
19542 unsigned AddSubOpc;
19543
19544 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19545 AddSubOpc = V->getOpcode();
19546 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19547 SDValue Opnd = V->getOperand(1);
19548 MulOper = V->getOperand(0);
19549 if (AddSubOpc == ISD::SUB)
19550 std::swap(Opnd, MulOper);
19551 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19552 return C->isOne();
19553 }
19554 return false;
19555 };
19556
19557 if (IsAddSubWith1(N0)) {
19558 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19559 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19560 }
19561
19562 if (IsAddSubWith1(N1)) {
19563 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19564 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19565 }
19566
19567 // The below optimizations require a constant RHS.
19568 if (!isa<ConstantSDNode>(N1))
19569 return SDValue();
19570
19572 const APInt &ConstValue = C->getAPIntValue();
19573
19574 // Allow the scaling to be folded into the `cnt` instruction by preventing
19575 // the scaling from being obscured here. This makes it easier to pattern match.
19576 if (IsSVECntIntrinsic(N0) ||
19577 (N0->getOpcode() == ISD::TRUNCATE &&
19578 (IsSVECntIntrinsic(N0->getOperand(0)))))
19579 if (ConstValue.sge(1) && ConstValue.sle(16))
19580 return SDValue();
19581
19582 // Multiplication of a power of two plus/minus one can be done more
19583 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19584 // future CPUs have a cheaper MADD instruction, this may need to be
19585 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19586 // 64-bit is 5 cycles, so this is always a win.
19587 // More aggressively, some multiplications N0 * C can be lowered to
19588 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19589 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19590 // TODO: lower more cases.
19591
19592 // TrailingZeroes is used to test if the mul can be lowered to
19593 // shift+add+shift.
19594 unsigned TrailingZeroes = ConstValue.countr_zero();
19595 if (TrailingZeroes) {
19596 // Conservatively do not lower to shift+add+shift if the mul might be
19597 // folded into smul or umul.
19598 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19599 isZeroExtended(N0, DAG)))
19600 return SDValue();
19601 // Conservatively do not lower to shift+add+shift if the mul might be
19602 // folded into madd or msub.
19603 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19604 N->user_begin()->getOpcode() == ISD::SUB))
19605 return SDValue();
19606 }
19607 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19608 // and shift+add+shift.
19609 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19610 unsigned ShiftAmt;
19611
19612 auto Shl = [&](SDValue N0, unsigned N1) {
19613 if (!N0.getNode())
19614 return SDValue();
19615 // If shift causes overflow, ignore this combine.
19616 if (N1 >= N0.getValueSizeInBits())
19617 return SDValue();
19618 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19619 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19620 };
19621 auto Add = [&](SDValue N0, SDValue N1) {
19622 if (!N0.getNode() || !N1.getNode())
19623 return SDValue();
19624 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19625 };
19626 auto Sub = [&](SDValue N0, SDValue N1) {
19627 if (!N0.getNode() || !N1.getNode())
19628 return SDValue();
19629 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19630 };
19631 auto Negate = [&](SDValue N) {
19632 if (!N0.getNode())
19633 return SDValue();
19634 SDValue Zero = DAG.getConstant(0, DL, VT);
19635 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19636 };
19637
19638 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19639 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19640 // the (2^N - 1) can't be executed via a single instruction.
19641 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19642 unsigned BitWidth = C.getBitWidth();
19643 for (unsigned i = 1; i < BitWidth / 2; i++) {
19644 APInt Rem;
19645 APInt X(BitWidth, (1 << i) + 1);
19646 APInt::sdivrem(C, X, N, Rem);
19647 APInt NVMinus1 = N - 1;
19648 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19649 M = X;
19650 return true;
19651 }
19652 }
19653 return false;
19654 };
19655
19656 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19657 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19658 // the (2^N - 1) can't be executed via a single instruction.
19659 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19660 APInt CVMinus1 = C - 1;
19661 if (CVMinus1.isNegative())
19662 return false;
19663 unsigned TrailingZeroes = CVMinus1.countr_zero();
19664 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19665 if (SCVMinus1.isPowerOf2()) {
19666 unsigned BitWidth = SCVMinus1.getBitWidth();
19667 M = APInt(BitWidth, SCVMinus1.logBase2());
19668 N = APInt(BitWidth, TrailingZeroes);
19669 return true;
19670 }
19671 return false;
19672 };
19673
19674 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19675 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19676 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19677 APInt CVMinus1 = C - 1;
19678 if (CVMinus1.isNegative())
19679 return false;
19680 unsigned TrailingZeroes = CVMinus1.countr_zero();
19681 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19682 if (CVPlus1.isPowerOf2()) {
19683 unsigned BitWidth = CVPlus1.getBitWidth();
19684 M = APInt(BitWidth, CVPlus1.logBase2());
19685 N = APInt(BitWidth, TrailingZeroes);
19686 return true;
19687 }
19688 return false;
19689 };
19690
19691 if (ConstValue.isNonNegative()) {
19692 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19693 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19694 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19695 // (mul x, (2^M + 1) * (2^N + 1))
19696 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19697 // (mul x, (2^M + 1) * 2^N + 1)
19698 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
19699 // (mul x, 1 - (1 - 2^M) * 2^N)
19700 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
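    // Worked example (illustrative): for C = 45 = (1+4)*(1+8), ShiftM1 = 2 and
    // ShiftN1 = 3, so the lowering is MV = (x << 2) + x followed by
    // (MV << 3) + MV, i.e. 5*x and then 9*(5*x) = 45*x.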
19701 APInt SCVMinus1 = ShiftedConstValue - 1;
19702 APInt SCVPlus1 = ShiftedConstValue + 1;
19703 APInt CVPlus1 = ConstValue + 1;
19704 APInt CVM, CVN;
19705 if (SCVMinus1.isPowerOf2()) {
19706 ShiftAmt = SCVMinus1.logBase2();
19707 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19708 } else if (CVPlus1.isPowerOf2()) {
19709 ShiftAmt = CVPlus1.logBase2();
19710 return Sub(Shl(N0, ShiftAmt), N0);
19711 } else if (SCVPlus1.isPowerOf2()) {
19712 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19713 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19714 }
19715 if (Subtarget->hasALULSLFast() &&
19716 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19717 APInt CVMMinus1 = CVM - 1;
19718 APInt CVNMinus1 = CVN - 1;
19719 unsigned ShiftM1 = CVMMinus1.logBase2();
19720 unsigned ShiftN1 = CVNMinus1.logBase2();
19721 // ALULSLFast implies that shifts of up to 4 places are fast
19722 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19723 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19724 return Add(Shl(MVal, ShiftN1), MVal);
19725 }
19726 }
19727 if (Subtarget->hasALULSLFast() &&
19728 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19729 unsigned ShiftM = CVM.getZExtValue();
19730 unsigned ShiftN = CVN.getZExtValue();
19731 // ALULSLFast implies that shifts of up to 4 places are fast
19732 if (ShiftM <= 4 && ShiftN <= 4) {
19733 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19734 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19735 }
19736 }
19737
19738 if (Subtarget->hasALULSLFast() &&
19739 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19740 unsigned ShiftM = CVM.getZExtValue();
19741 unsigned ShiftN = CVN.getZExtValue();
19742 // ALULSLFast implies that shifts of up to 4 places are fast
19743 if (ShiftM <= 4 && ShiftN <= 4) {
19744 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19745 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19746 }
19747 }
19748 } else {
19749 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19750 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19751 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19752 APInt SCVPlus1 = -ShiftedConstValue + 1;
19753 APInt CVNegPlus1 = -ConstValue + 1;
19754 APInt CVNegMinus1 = -ConstValue - 1;
19755 if (CVNegPlus1.isPowerOf2()) {
19756 ShiftAmt = CVNegPlus1.logBase2();
19757 return Sub(N0, Shl(N0, ShiftAmt));
19758 } else if (CVNegMinus1.isPowerOf2()) {
19759 ShiftAmt = CVNegMinus1.logBase2();
19760 return Negate(Add(Shl(N0, ShiftAmt), N0));
19761 } else if (SCVPlus1.isPowerOf2()) {
19762 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19763 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19764 }
19765 }
19766
19767 return SDValue();
19768}
19769
19771 SelectionDAG &DAG) {
19772 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19773 // optimize away the operation when it's applied to a constant.
19774 //
19775 // The general transformation is:
19776 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19777 // AND(VECTOR_CMP(x,y), constant2)
19778 // constant2 = UNARYOP(constant)
19779
19780 // Early exit if this isn't a vector operation, the operand of the
19781 // unary operation isn't a bitwise AND, or if the sizes of the operations
19782 // aren't the same.
19783 EVT VT = N->getValueType(0);
19784 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19785 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19786 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19787 return SDValue();
19788
19789 // Now check that the other operand of the AND is a constant. We could
19790 // make the transformation for non-constant splats as well, but it's unclear
19791 // that would be a benefit as it would not eliminate any operations, just
19792 // perform one more step in scalar code before moving to the vector unit.
19793 if (BuildVectorSDNode *BV =
19794 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19795 // Bail out if the vector isn't a constant.
19796 if (!BV->isConstant())
19797 return SDValue();
19798
19799 // Everything checks out. Build up the new and improved node.
19800 SDLoc DL(N);
19801 EVT IntVT = BV->getValueType(0);
19802 // Create a new constant of the appropriate type for the transformed
19803 // DAG.
19804 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19805 // The AND node needs bitcasts to/from an integer vector type around it.
19806 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19807 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19808 N->getOperand(0)->getOperand(0), MaskConst);
19809 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19810 return Res;
19811 }
19812
19813 return SDValue();
19814}
19815
19816/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19817/// functions; this can help to reduce the number of fmovs to/from GPRs.
19818static SDValue
19821 const AArch64Subtarget *Subtarget) {
19822 if (N->isStrictFPOpcode())
19823 return SDValue();
19824
19825 if (DCI.isBeforeLegalizeOps())
19826 return SDValue();
19827
19828 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19829 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19830 return SDValue();
19831
19832 auto isSupportedType = [](EVT VT) {
19833 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19834 };
19835
19836 SDValue SrcVal = N->getOperand(0);
19837 EVT SrcTy = SrcVal.getValueType();
19838 EVT DestTy = N->getValueType(0);
19839
19840 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19841 return SDValue();
19842
19843 EVT SrcVecTy;
19844 EVT DestVecTy;
19845 if (DestTy.bitsGT(SrcTy)) {
19846 DestVecTy = getPackedSVEVectorVT(DestTy);
19847 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19848 } else {
19849 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19850 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19851 }
19852
19853 // Ensure the resulting src/dest vector type is legal.
19854 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19855 return SDValue();
19856
19857 SDLoc DL(N);
19858 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19859 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19860 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19861 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19862 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19863}
19864
19867 const AArch64Subtarget *Subtarget) {
19868 // First try to optimize away the conversion when it's conditionally from
19869 // a constant. Vectors only.
19871 return Res;
19872
19873 if (SDValue Res =
19874 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19875 return Res;
19876
19877 EVT VT = N->getValueType(0);
19878 if (VT != MVT::f32 && VT != MVT::f64)
19879 return SDValue();
19880
19881 // Only optimize when the source and destination types have the same width.
19882 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19883 return SDValue();
19884
19885 // If the result of an integer load is only used by an integer-to-float
19886 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
19887 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19888 SDValue N0 = N->getOperand(0);
19889 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19890 N0.hasOneUse() &&
19891 // Do not change the width of a volatile load.
19892 !cast<LoadSDNode>(N0)->isVolatile()) {
19893 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19894 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19895 LN0->getPointerInfo(), LN0->getAlign(),
19896 LN0->getMemOperand()->getFlags());
19897
19898 // Make sure successors of the original load stay after it by updating them
19899 // to use the new Chain.
19900 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19901
19902 unsigned Opcode =
19903 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19904 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19905 }
19906
19907 return SDValue();
19908}
19909
19910/// Fold a floating-point multiply by power of two into floating-point to
19911/// fixed-point conversion.
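/// For example (illustrative): (fptosi (fmul v4f32 X, splat(8.0))) can become
/// a single fixed-point convert with 3 fractional bits (via the
/// aarch64_neon_vcvtfp2fxs intrinsic), since multiplying by 2^3 before a
/// truncating convert is equivalent to converting with 3 fractional bits.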
19914 const AArch64Subtarget *Subtarget) {
19915 if (SDValue Res =
19916 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19917 return Res;
19918
19919 if (!Subtarget->isNeonAvailable())
19920 return SDValue();
19921
19922 if (!N->getValueType(0).isSimple())
19923 return SDValue();
19924
19925 SDValue Op = N->getOperand(0);
19926 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19927 return SDValue();
19928
19929 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19930 return SDValue();
19931
19932 SDValue ConstVec = Op->getOperand(1);
19933 if (!isa<BuildVectorSDNode>(ConstVec))
19934 return SDValue();
19935
19936 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19937 uint32_t FloatBits = FloatTy.getSizeInBits();
19938 if (FloatBits != 32 && FloatBits != 64 &&
19939 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19940 return SDValue();
19941
19942 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19943 uint32_t IntBits = IntTy.getSizeInBits();
19944 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19945 return SDValue();
19946
19947 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19948 if (IntBits > FloatBits)
19949 return SDValue();
19950
19951 BitVector UndefElements;
19953 int32_t Bits = IntBits == 64 ? 64 : 32;
19954 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19955 if (C == -1 || C == 0 || C > Bits)
19956 return SDValue();
19957
19958 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19959 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19960 return SDValue();
19961
19962 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19963 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19964 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19965 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19966 return SDValue();
19967 }
19968
19969 SDLoc DL(N);
19970 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19971 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19972 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19973 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19974 SDValue FixConv =
19976 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19977 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
19978 // We can handle smaller integers by generating an extra trunc.
19979 if (IntBits < FloatBits)
19980 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19981
19982 return FixConv;
19983}
19984
19985// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19986// convert to csel(ccmp(.., cc0)), depending on cc1:
19987
19988// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19989// =>
19990// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19991//
19992// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19993// =>
19994// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
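// For example (illustrative), for i1 (x0 == 0) && (x1 == x2) this enables a
// sequence along the lines of:
//   cmp  x0, #0
//   ccmp x1, x2, #0, eq
//   cset w0, eq
// where the #0 NZCV immediate makes the final 'eq' fail whenever the first
// compare fails.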
19996 EVT VT = N->getValueType(0);
19997 SDValue CSel0 = N->getOperand(0);
19998 SDValue CSel1 = N->getOperand(1);
19999
20000 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20001 CSel1.getOpcode() != AArch64ISD::CSEL)
20002 return SDValue();
20003
20004 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20005 return SDValue();
20006
20007 if (!isNullConstant(CSel0.getOperand(0)) ||
20008 !isOneConstant(CSel0.getOperand(1)) ||
20009 !isNullConstant(CSel1.getOperand(0)) ||
20010 !isOneConstant(CSel1.getOperand(1)))
20011 return SDValue();
20012
20013 SDValue Cmp0 = CSel0.getOperand(3);
20014 SDValue Cmp1 = CSel1.getOperand(3);
20017 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20018 return SDValue();
20019 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20020 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20021 std::swap(Cmp0, Cmp1);
20022 std::swap(CC0, CC1);
20023 }
20024
20025 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20026 return SDValue();
20027
20028 SDLoc DL(N);
20029 SDValue CCmp, Condition;
20030 unsigned NZCV;
20031
20032 if (N->getOpcode() == ISD::AND) {
20034 Condition = getCondCode(DAG, InvCC0);
20036 } else {
20038 Condition = getCondCode(DAG, CC0);
20040 }
20041
20042 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20043
20044 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20045 if (Op1 && Op1->getAPIntValue().isNegative() &&
20046 Op1->getAPIntValue().sgt(-32)) {
20047 // CCMP accepts constants in the range [0, 31].
20048 // If Op1 is a constant in the range [-31, -1], we
20049 // can select CCMN instead to avoid the extra mov.
20050 SDValue AbsOp1 =
20051 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20052 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20053 AbsOp1, NZCVOp, Condition, Cmp0);
20054 } else {
20055 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20056 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20057 }
20058 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20059 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20060}
20061
20063 const AArch64Subtarget *Subtarget,
20064 const AArch64TargetLowering &TLI) {
20065 SelectionDAG &DAG = DCI.DAG;
20066
20067 if (SDValue R = performANDORCSELCombine(N, DAG))
20068 return R;
20069
20070 return SDValue();
20071}
20072
20074 if (!MemVT.getVectorElementType().isSimple())
20075 return false;
20076
20077 uint64_t MaskForTy = 0ull;
20078 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20079 case MVT::i8:
20080 MaskForTy = 0xffull;
20081 break;
20082 case MVT::i16:
20083 MaskForTy = 0xffffull;
20084 break;
20085 case MVT::i32:
20086 MaskForTy = 0xffffffffull;
20087 break;
20088 default:
20089 return false;
20090 break;
20091 }
20092
20093 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20094 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20095 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20096
20097 return false;
20098}
20099
20101 SDValue LeafOp = SDValue(N, 0);
20102 SDValue Op = N->getOperand(0);
20103 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20104 LeafOp.getValueType() != Op.getValueType())
20105 Op = Op->getOperand(0);
20106 if (LeafOp.getValueType() == Op.getValueType())
20107 return Op;
20108 return SDValue();
20109}
20110
20113 SelectionDAG &DAG = DCI.DAG;
20114 SDValue Src = N->getOperand(0);
20115 unsigned Opc = Src->getOpcode();
20116
20117 // Zero/any extend of an unsigned unpack
20118 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20119 SDValue UnpkOp = Src->getOperand(0);
20120 SDValue Dup = N->getOperand(1);
20121
20122 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20123 return SDValue();
20124
20125 SDLoc DL(N);
20127 if (!C)
20128 return SDValue();
20129
20130 uint64_t ExtVal = C->getZExtValue();
20131
20132 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20133 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20134 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20135 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20136 };
20137
20138 // If the mask is fully covered by the unpack, we don't need to push
20139 // a new AND onto the operand
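    // For example (illustrative): and(uunpklo(nxv16i8 x) : nxv8i16, splat(0xff))
    // folds to just uunpklo(x), because the unpack already zero-extends each i8
    // lane into its wider i16 lane.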
20140 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20141 if (MaskAndTypeMatch(EltTy))
20142 return Src;
20143
20144 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20145 // to see if the mask is all-ones of size MemTy.
20146 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20147 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20148 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20149 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20150 if (MaskAndTypeMatch(EltTy))
20151 return Src;
20152 }
20153
20154 // Truncate to prevent a DUP with an over-wide constant
20155 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20156
20157 // Otherwise, make sure we propagate the AND to the operand
20158 // of the unpack
20159 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20160 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20161
20162 SDValue And = DAG.getNode(ISD::AND, DL,
20163 UnpkOp->getValueType(0), UnpkOp, Dup);
20164
20165 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20166 }
20167
20168 if (DCI.isBeforeLegalizeOps())
20169 return SDValue();
20170
20171 // If either side of the AND is an all-active predicate then the AND is a
20172 // no-op and we can return the other operand as the result.
20173 if (isAllActivePredicate(DAG, N->getOperand(0)))
20174 return N->getOperand(1);
20175 if (isAllActivePredicate(DAG, N->getOperand(1)))
20176 return N->getOperand(0);
20177
20179 return SDValue();
20180
20181 SDValue Mask = N->getOperand(1);
20182
20183 if (!Src.hasOneUse())
20184 return SDValue();
20185
20186 EVT MemVT;
20187
20188 // SVE load instructions perform an implicit zero-extend, which makes them
20189 // perfect candidates for combining.
20190 switch (Opc) {
20191 case AArch64ISD::LD1_MERGE_ZERO:
20192 case AArch64ISD::LDNF1_MERGE_ZERO:
20193 case AArch64ISD::LDFF1_MERGE_ZERO:
20194 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20195 break;
20196 case AArch64ISD::GLD1_MERGE_ZERO:
20197 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20198 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20199 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20200 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20201 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20202 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20203 case AArch64ISD::GLDFF1_MERGE_ZERO:
20204 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20205 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20206 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20207 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20208 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20209 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20210 case AArch64ISD::GLDNT1_MERGE_ZERO:
20211 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20212 break;
20213 default:
20214 return SDValue();
20215 }
20216
20217 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20218 return Src;
20219
20220 return SDValue();
20221}
20222
20223// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20226
20227 // This function performs an optimization on a specific pattern involving
20228 // an AND operation and a SETCC (Set Condition Code) node.
20229
20230 SDValue SetCC = N->getOperand(0);
20231 EVT VT = N->getValueType(0);
20232 SelectionDAG &DAG = DCI.DAG;
20233
20234 // If the current node (N) is used by any SELECT instruction, return an empty
20235 // SDValue and skip the optimization, since applying it in that case could
20236 // produce incorrect results.
20237 for (auto U : N->users())
20238 if (U->getOpcode() == ISD::SELECT)
20239 return SDValue();
20240
20241 // Check if the operand is a SETCC node with floating-point comparison
20242 // Check if the operand is a SETCC node with a floating-point comparison
20243 SetCC.getOperand(0).getValueType() == MVT::f32) {
20244
20245 SDValue Cmp;
20247
20248 // Check if the DAG is after legalization and if we can emit the conjunction
20249 if (!DCI.isBeforeLegalize() &&
20250 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20251
20253
20254 SDLoc DL(N);
20255 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20256 DAG.getConstant(0, DL, VT),
20257 getCondCode(DAG, InvertedCC), Cmp);
20258 }
20259 }
20260 return SDValue();
20261}
20262
20263static SDValue performANDCombine(SDNode *N,
20264                                 TargetLowering::DAGCombinerInfo &DCI) {
20265  SelectionDAG &DAG = DCI.DAG;
20266 SDValue LHS = N->getOperand(0);
20267 SDValue RHS = N->getOperand(1);
20268 EVT VT = N->getValueType(0);
20269
20270 if (SDValue R = performANDORCSELCombine(N, DAG))
20271 return R;
20272
20273  if (SDValue R = performANDSETCCCombine(N, DCI))
20274 return R;
20275
20276 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20277 return SDValue();
20278
20279 if (VT.isScalableVector())
20280 return performSVEAndCombine(N, DCI);
20281
20282 // The combining code below works only for NEON vectors. In particular, it
20283 // does not work for SVE when dealing with vectors wider than 128 bits.
20284 if (!VT.is64BitVector() && !VT.is128BitVector())
20285 return SDValue();
20286
20287  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
20288  if (!BVN)
20289 return SDValue();
20290
20291 // AND does not accept an immediate, so check if we can use a BIC immediate
20292 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20293 // pattern in isel, because some immediates may be lowered to the preferred
20294 // (and x, (movi imm)) form, even though an mvni representation also exists.
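  // For example (illustrative): a v4i32 AND with a splat of 0xffffff00 has no
  // MOVI encoding for the mask itself, but the inverted constant 0xff does, so
  // it can be selected as "bic v0.4s, #255" on the existing register instead.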
20295 APInt DefBits(VT.getSizeInBits(), 0);
20296 APInt UndefBits(VT.getSizeInBits(), 0);
20297 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20298 SDValue NewOp;
20299
20300 // Any bits known to already be 0 need not be cleared again, which can help
20301 // reduce the size of the immediate to one supported by the instruction.
20302 KnownBits Known = DAG.computeKnownBits(LHS);
20303 APInt ZeroSplat(VT.getSizeInBits(), 0);
20304 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20305 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20306 << (Known.Zero.getBitWidth() * I);
20307
20308 DefBits = ~(DefBits | ZeroSplat);
20309 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20310 DefBits, &LHS)) ||
20311 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20312 DefBits, &LHS)))
20313 return NewOp;
20314
20315 UndefBits = ~(UndefBits | ZeroSplat);
20316 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20317 UndefBits, &LHS)) ||
20318 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20319 UndefBits, &LHS)))
20320 return NewOp;
20321 }
20322
20323 return SDValue();
20324}
20325
20326static SDValue performFADDCombine(SDNode *N,
20327                                  TargetLowering::DAGCombinerInfo &DCI) {
20328  SelectionDAG &DAG = DCI.DAG;
20329 SDValue LHS = N->getOperand(0);
20330 SDValue RHS = N->getOperand(1);
20331 EVT VT = N->getValueType(0);
20332 SDLoc DL(N);
20333
20334 if (!N->getFlags().hasAllowReassociation())
20335 return SDValue();
20336
20337  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20338 auto ReassocComplex = [&](SDValue A, SDValue B) {
20339 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20340 return SDValue();
20341 unsigned Opc = A.getConstantOperandVal(0);
20342 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20343 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20344 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20345 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20346 return SDValue();
20347 SDValue VCMLA = DAG.getNode(
20348 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20349 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20350 A.getOperand(2), A.getOperand(3));
20351 VCMLA->setFlags(A->getFlags());
20352 return VCMLA;
20353 };
20354 if (SDValue R = ReassocComplex(LHS, RHS))
20355 return R;
20356 if (SDValue R = ReassocComplex(RHS, LHS))
20357 return R;
20358
20359 return SDValue();
20360}
20361
20362static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20363 switch (Opcode) {
20364 case ISD::STRICT_FADD:
20365 case ISD::FADD:
20366 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20367 case ISD::ADD:
20368 return VT == MVT::i64;
20369 default:
20370 return false;
20371 }
20372}
20373
20374static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20375                        AArch64CC::CondCode Cond);
20376
20377static bool isPredicateCCSettingOp(SDValue N) {
20378 if ((N.getOpcode() == ISD::SETCC) ||
20379 // get_active_lane_mask is lowered to a whilelo instruction.
20380 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20381 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20382 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20383 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20384 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20385 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20386 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20387 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20388 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20389 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20390 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20391 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20392 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20393 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20394 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20395 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20396 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20397 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20398 return true;
20399
20400 return false;
20401}
20402
20403// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20404// ... into: "ptrue p, all" + PTEST
20405static SDValue
20406performFirstTrueTestVectorCombine(SDNode *N,
20407                                  TargetLowering::DAGCombinerInfo &DCI,
20408                                  const AArch64Subtarget *Subtarget) {
20409 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20410 // Make sure PTEST can be legalised with illegal types.
20411 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20412 return SDValue();
20413
20414 SDValue N0 = N->getOperand(0);
20415 EVT VT = N0.getValueType();
20416
20417 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20418 !isNullConstant(N->getOperand(1)))
20419 return SDValue();
20420
20421  // Restrict the DAG combine to cases where we're extracting from a
20422 // flag-setting operation.
20423 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20424 return SDValue();
20425
20426 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20427 SelectionDAG &DAG = DCI.DAG;
20428 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20429 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20430}
20431
20432// Materialize : Idx = (add (mul vscale, NumEls), -1)
20433// i1 = extract_vector_elt t37, Constant:i64<Idx>
20434// ... into: "ptrue p, all" + PTEST
20435static SDValue
20436performLastTrueTestVectorCombine(SDNode *N,
20437                                 TargetLowering::DAGCombinerInfo &DCI,
20438                                 const AArch64Subtarget *Subtarget) {
20439 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20440  // Make sure PTEST is only created once the types involved are legal.
20441 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20442 return SDValue();
20443
20444 SDValue N0 = N->getOperand(0);
20445 EVT OpVT = N0.getValueType();
20446
20447 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20448 return SDValue();
20449
20450 // Idx == (add (mul vscale, NumEls), -1)
20451 SDValue Idx = N->getOperand(1);
20452 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20453 return SDValue();
20454
20455 SDValue VS = Idx.getOperand(0);
20456 if (VS.getOpcode() != ISD::VSCALE)
20457 return SDValue();
20458
20459 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20460 if (VS.getConstantOperandVal(0) != NumEls)
20461 return SDValue();
20462
20463 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20464 SelectionDAG &DAG = DCI.DAG;
20465 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20466 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20467}
20468
20469static SDValue
20470performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20471                                const AArch64Subtarget *Subtarget) {
20472 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20473 SelectionDAG &DAG = DCI.DAG;
20474 SDValue Vec = N->getOperand(0);
20475 SDValue Idx = N->getOperand(1);
20476
20477 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20478 return SDValue();
20479
20480 // Only legal for 8, 16, 32, and 64 bit element types.
20481 EVT EltVT = Vec.getValueType().getVectorElementType();
20482 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20483 MVT::bf16, MVT::f32, MVT::f64}),
20484 EltVT.getSimpleVT().SimpleTy))
20485 return SDValue();
20486
20487 SDValue Mask = Idx.getOperand(0);
20488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20489 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20490 return SDValue();
20491
20492 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20493 Vec);
20494}
20495
20496static SDValue
20497performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20498                               const AArch64Subtarget *Subtarget) {
20499 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20500 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20501 return Res;
20502 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20503 return Res;
20504 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20505 return Res;
20506
20507 SelectionDAG &DAG = DCI.DAG;
20508 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20509
20510 EVT VT = N->getValueType(0);
20511 const bool FullFP16 = Subtarget->hasFullFP16();
20512 bool IsStrict = N0->isStrictFPOpcode();
20513
20514 // extract(dup x) -> x
20515 if (N0.getOpcode() == AArch64ISD::DUP)
20516 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20517 : N0.getOperand(0);
20518
20519 // Rewrite for pairwise fadd pattern
20520 // (f32 (extract_vector_elt
20521 // (fadd (vXf32 Other)
20522 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20523 // ->
20524 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20525 // (extract_vector_elt (vXf32 Other) 1))
20526 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20527 // we can only do this when it's used only by the extract_vector_elt.
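  // For example (illustrative): with a v2f32 Other, this rewrites
  //   (extract_vector_elt (fadd Other, (vector_shuffle Other, undef, <1,u>)), 0)
  // into fadd(extract(Other, 0), extract(Other, 1)), which matches the scalar
  // FADDP (pairwise add of the bottom two lanes).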
20528 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20529 (!IsStrict || N0.hasOneUse())) {
20530 SDLoc DL(N0);
20531 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20532 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20533
20534    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20535    SDValue Other = N00;
20536
20537 // And handle the commutative case.
20538 if (!Shuffle) {
20539 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20540 Other = N01;
20541 }
20542
20543 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20544 Other == Shuffle->getOperand(0)) {
20545 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20546 DAG.getConstant(0, DL, MVT::i64));
20547 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20548 DAG.getConstant(1, DL, MVT::i64));
20549 if (!IsStrict)
20550 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20551
20552    // For strict_fadd we need uses of the final extract_vector_elt to be replaced
20553 // with the strict_fadd, but we also need uses of the chain output of the
20554 // original strict_fadd to use the chain output of the new strict_fadd as
20555 // otherwise it may not be deleted.
20556 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20557 {VT, MVT::Other},
20558 {N0->getOperand(0), Extract1, Extract2});
20559 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20560 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20561 return SDValue(N, 0);
20562 }
20563 }
20564
20565 // Given an extract(load) or extract(extend(load)), produce a scalar load
20566 // instead to avoid the cross-register-bank copies.
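  // For example (illustrative):
  //   (i32 (extract_vector_elt (v4i32 (load %p)), 2))
  // can be selected as a plain scalar "ldr w0, [x0, #8]" rather than a NEON
  // load followed by a lane-to-GPR move.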
20567 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
20568 VT.isInteger() && isa<ConstantSDNode>(N1)) {
20569 SDValue LoadN0 = N0;
20570 // Look through sext/zext and extract_subvector / insert_subvector if
20571 // required.
20572 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
20573 N0.getOpcode() == ISD::SIGN_EXTEND ||
20574 N0.getOpcode() == ISD::ANY_EXTEND) &&
20575 N0.getOperand(0).hasOneUse())
20576 LoadN0 = N0.getOperand(0);
20577 unsigned OffsetElts = 0;
20578 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
20579 OffsetElts = LoadN0.getConstantOperandVal(1);
20580 LoadN0 = LoadN0.getOperand(0);
20581 }
20582 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
20583 LoadN0.getOperand(0).isUndef() &&
20584 isNullConstant(LoadN0.getOperand(2)) &&
20585 LoadN0.getOperand(1).hasOneUse())
20586 LoadN0 = LoadN0.getOperand(1);
20587
20588 // Check all the uses are valid and can be scalarized. We check that all the
20589 // uses are extracts and those extracts are not re-inserted into an
20590 // operation best treated as a vector register.
20591 auto Load = dyn_cast<LoadSDNode>(LoadN0);
20592 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
20593 Load->getMemoryVT().isByteSized() &&
20594 all_of(N0->uses(), [&](const SDUse &U) {
20595 return U.getResNo() != N0.getResNo() ||
20596 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20597 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
20598 return U2.getUser()->getOpcode() ==
20599 ISD::INSERT_VECTOR_ELT ||
20600 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
20601 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
20602 }));
20603 })) {
20604
20605 SDLoc DL(Load);
20606
20607 // Generate a new scalar load.
20608 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
20609 Load->getValueType(0).getScalarSizeInBits() / 8;
20610 SDValue BasePtr = DAG.getObjectPtrOffset(
20611 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
20612 ISD::LoadExtType ExtType =
20613          N0.getOpcode() == ISD::ZERO_EXTEND
20614              ? ISD::ZEXTLOAD
20615              : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
20616                                                    : ISD::EXTLOAD);
20617 SDValue ScalarLoad =
20618 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
20619 Load->getPointerInfo().getWithOffset(Offset),
20620 Load->getValueType(0).getScalarType(),
20621 commonAlignment(Load->getAlign(), Offset),
20622 Load->getMemOperand()->getFlags(), Load->getAAInfo());
20623 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
20624 return ScalarLoad;
20625 }
20626 }
20627
20628 return SDValue();
20629}
20630
20631static SDValue performConcatVectorsCombine(SDNode *N,
20632                                           TargetLowering::DAGCombinerInfo &DCI,
20633                                           SelectionDAG &DAG) {
20634 SDLoc DL(N);
20635 EVT VT = N->getValueType(0);
20636 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20637 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20638
20639 if (VT.isScalableVector())
20640 return SDValue();
20641
20642 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20643 N1Opc == ISD::TRUNCATE) {
20644 SDValue N00 = N0->getOperand(0);
20645 SDValue N10 = N1->getOperand(0);
20646 EVT N00VT = N00.getValueType();
20647 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20648
20649 // Optimize concat_vectors of truncated vectors, where the intermediate
20650 // type is illegal, to avoid said illegality, e.g.,
20651 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20652 // (v2i16 (truncate (v2i64)))))
20653 // ->
20654 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20655 // (v4i32 (bitcast (v2i64))),
20656 // <0, 2, 4, 6>)))
20657 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20658 // on both input and result type, so we might generate worse code.
20659 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20660 if (N00VT == N10.getValueType() &&
20661 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20662 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20663 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20664      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20665      for (size_t i = 0; i < Mask.size(); ++i)
20666 Mask[i] = i * 2;
20667 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20668 DAG.getVectorShuffle(
20669 MidVT, DL,
20670 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20671 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20672 }
20673
20674 // Optimize two large shifts and a combine into a single combine and shift
20675 // For AArch64 architectures, sequences like the following:
20676 //
20677 // ushr v0.4s, v0.4s, #20
20678 // ushr v1.4s, v1.4s, #20
20679 // uzp1 v0.8h, v0.8h, v1.8h
20680 //
20681 // Can be optimized to:
20682 //
20683 // uzp2 v0.8h, v0.8h, v1.8h
20684 // ushr v0.8h, v0.8h, #4
20685 //
20686 // This optimization reduces instruction count.
20687 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20688 N00->getOperand(1) == N10->getOperand(1)) {
20689 SDValue N000 = N00->getOperand(0);
20690 SDValue N100 = N10->getOperand(0);
20691 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20692 N101ConstVal = N10->getConstantOperandVal(1),
20693 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20694
20695 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20696 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20697 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20698 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20699 SDValue NewShiftConstant =
20700 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20701
20702 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20703 }
20704 }
20705 }
20706
20707 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20708 N->getOperand(0).getValueType() == MVT::v2i16 ||
20709 N->getOperand(0).getValueType() == MVT::v2i8) {
20710 EVT SrcVT = N->getOperand(0).getValueType();
20711 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20712 // loads to prevent having to go through the v4i8 load legalization that
20713 // needs to extend each element into a larger type.
20714 if (N->getNumOperands() % 2 == 0 &&
20715 all_of(N->op_values(), [SrcVT](SDValue V) {
20716 if (V.getValueType() != SrcVT)
20717 return false;
20718 if (V.isUndef())
20719 return true;
20720 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20721 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20722 LD->getExtensionType() == ISD::NON_EXTLOAD;
20723 })) {
20724 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20725 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20726      SmallVector<SDValue> Ops;
20727
20728 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20729 SDValue V = N->getOperand(i);
20730 if (V.isUndef())
20731 Ops.push_back(DAG.getUNDEF(FVT));
20732 else {
20733          LoadSDNode *LD = cast<LoadSDNode>(V);
20734          SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20735 LD->getBasePtr(), LD->getMemOperand());
20736 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20737 Ops.push_back(NewLoad);
20738 }
20739 }
20740 return DAG.getBitcast(N->getValueType(0),
20741 DAG.getBuildVector(NVT, DL, Ops));
20742 }
20743 }
20744
20745 // Canonicalise concat_vectors to replace concatenations of truncated nots
20746 // with nots of concatenated truncates. This in some cases allows for multiple
20747 // redundant negations to be eliminated.
20748 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20749 // (v4i16 (truncate (not (v4i32)))))
20750 // ->
20751 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20752 // (v4i16 (truncate (v4i32)))))
20753 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20754 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20755 N->isOnlyUserOf(N1.getNode())) {
20756 auto isBitwiseVectorNegate = [](SDValue V) {
20757 return V->getOpcode() == ISD::XOR &&
20758 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20759 };
20760 SDValue N00 = N0->getOperand(0);
20761 SDValue N10 = N1->getOperand(0);
20762 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20763 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20764 return DAG.getNOT(
20765 DL,
20766          DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20767                      DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(),
20768                                  N00->getOperand(0)),
20769                      DAG.getNode(ISD::TRUNCATE, DL, N1.getValueType(),
20770                                  N10->getOperand(0))),
20771 VT);
20772 }
20773 }
20774
20775 // Wait till after everything is legalized to try this. That way we have
20776 // legal vector types and such.
20777 if (DCI.isBeforeLegalizeOps())
20778 return SDValue();
20779
20780 // Optimise concat_vectors of two identical binops with a 128-bit destination
20781// size, combining them into a binop of two concats of the source vectors, e.g.:
20782 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20783 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20784 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20785 isVectorizedBinOp(N0Opc)) &&
20786 N0->hasOneUse() && N1->hasOneUse()) {
20787 SDValue N00 = N0->getOperand(0);
20788 SDValue N01 = N0->getOperand(1);
20789 SDValue N10 = N1->getOperand(0);
20790 SDValue N11 = N1->getOperand(1);
20791
20792 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20793 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20794 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20795 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20796 }
20797 }
20798
20799 auto IsRSHRN = [](SDValue Shr) {
20800 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20801 return false;
20802 SDValue Op = Shr.getOperand(0);
20803 EVT VT = Op.getValueType();
20804 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20805 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20806 return false;
20807
20808 APInt Imm;
20809 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20810 Imm = APInt(VT.getScalarSizeInBits(),
20811 Op.getOperand(1).getConstantOperandVal(0)
20812 << Op.getOperand(1).getConstantOperandVal(1));
20813 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20814 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20815 Imm = APInt(VT.getScalarSizeInBits(),
20816 Op.getOperand(1).getConstantOperandVal(0));
20817 else
20818 return false;
20819
20820 if (Imm != 1ULL << (ShtAmt - 1))
20821 return false;
20822 return true;
20823 };
20824
20825 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20826 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20827 ((IsRSHRN(N1) &&
20828        N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20829       N1.isUndef())) {
20830 SDValue X = N0.getOperand(0).getOperand(0);
20831 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20832 : N1.getOperand(0).getOperand(0);
20833 EVT BVT =
20834 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20835 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20836 SDValue Add = DAG.getNode(
20837 ISD::ADD, DL, BVT, CC,
20838 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20839 SDValue Shr =
20840 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20841 return Shr;
20842 }
20843
20844 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20845 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20846 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20847 N0.getOperand(1) == N1.getOperand(1)) {
20848 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20849 DAG.getUNDEF(N0.getValueType()));
20850 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20851 DAG.getUNDEF(N0.getValueType()));
20852 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20853 }
20854
20855 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20856 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20857 // canonicalise to that.
20858 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20859 assert(VT.getScalarSizeInBits() == 64);
20860 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20861 DAG.getConstant(0, DL, MVT::i64));
20862 }
20863
20864 // Canonicalise concat_vectors so that the right-hand vector has as few
20865 // bit-casts as possible before its real operation. The primary matching
20866 // destination for these operations will be the narrowing "2" instructions,
20867 // which depend on the operation being performed on this right-hand vector.
20868 // For example,
20869 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20870 // becomes
20871 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20872
20873 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20874 return SDValue();
20875 SDValue RHS = N1->getOperand(0);
20876 MVT RHSTy = RHS.getValueType().getSimpleVT();
20877 // If the RHS is not a vector, this is not the pattern we're looking for.
20878 if (!RHSTy.isVector())
20879 return SDValue();
20880
20881 LLVM_DEBUG(
20882 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20883
20884 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20885 RHSTy.getVectorNumElements() * 2);
20886 return DAG.getNode(ISD::BITCAST, DL, VT,
20887 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20888 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20889 RHS));
20890}
20891
20892static SDValue
20893performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20894                               SelectionDAG &DAG) {
20895 if (DCI.isBeforeLegalizeOps())
20896 return SDValue();
20897
20898 EVT VT = N->getValueType(0);
20899 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20900 return SDValue();
20901
20902 SDValue V = N->getOperand(0);
20903
20904 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20905 // blocks this combine because the non-const case requires custom lowering.
20906 //
20907 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20908 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20909 if (isa<ConstantSDNode>(V.getOperand(0)))
20910 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20911
20912 return SDValue();
20913}
20914
20915static SDValue
20916performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20917                              SelectionDAG &DAG) {
20918 SDLoc DL(N);
20919 SDValue Vec = N->getOperand(0);
20920 SDValue SubVec = N->getOperand(1);
20921 uint64_t IdxVal = N->getConstantOperandVal(2);
20922 EVT VecVT = Vec.getValueType();
20923 EVT SubVT = SubVec.getValueType();
20924
20925 // Promote fixed length vector zeros.
20926 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20927 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20928 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20929 : DAG.getConstantFP(0, DL, VecVT);
20930
20931 // Only do this for legal fixed vector types.
20932 if (!VecVT.isFixedLengthVector() ||
20933 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20934 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20935 return SDValue();
20936
20937 // Ignore widening patterns.
20938 if (IdxVal == 0 && Vec.isUndef())
20939 return SDValue();
20940
20941 // Subvector must be half the width and an "aligned" insertion.
20942 unsigned NumSubElts = SubVT.getVectorNumElements();
20943 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20944 (IdxVal != 0 && IdxVal != NumSubElts))
20945 return SDValue();
20946
20947 // Fold insert_subvector -> concat_vectors
20948 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20949 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20950 SDValue Lo, Hi;
20951 if (IdxVal == 0) {
20952 Lo = SubVec;
20953 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20954 DAG.getVectorIdxConstant(NumSubElts, DL));
20955 } else {
20956 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20957 DAG.getVectorIdxConstant(0, DL));
20958 Hi = SubVec;
20959 }
20960 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20961}
20962
20963static SDValue tryCombineFixedPointConvert(SDNode *N,
20964                                           TargetLowering::DAGCombinerInfo &DCI,
20965                                           SelectionDAG &DAG) {
20966 // Wait until after everything is legalized to try this. That way we have
20967 // legal vector types and such.
20968 if (DCI.isBeforeLegalizeOps())
20969 return SDValue();
20970 // Transform a scalar conversion of a value from a lane extract into a
20971 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20972 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20973 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20974 //
20975 // The second form interacts better with instruction selection and the
20976 // register allocator to avoid cross-class register copies that aren't
20977 // coalescable due to a lane reference.
20978
20979 // Check the operand and see if it originates from a lane extract.
20980 SDValue Op1 = N->getOperand(1);
20981  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20982    return SDValue();
20983
20984 // Yep, no additional predication needed. Perform the transform.
20985 SDValue IID = N->getOperand(0);
20986 SDValue Shift = N->getOperand(2);
20987 SDValue Vec = Op1.getOperand(0);
20988 SDValue Lane = Op1.getOperand(1);
20989 EVT ResTy = N->getValueType(0);
20990 EVT VecResTy;
20991 SDLoc DL(N);
20992
20993 // The vector width should be 128 bits by the time we get here, even
20994 // if it started as 64 bits (the extract_vector handling will have
20995 // done so). Bail if it is not.
20996 if (Vec.getValueSizeInBits() != 128)
20997 return SDValue();
20998
20999 if (Vec.getValueType() == MVT::v4i32)
21000 VecResTy = MVT::v4f32;
21001 else if (Vec.getValueType() == MVT::v2i64)
21002 VecResTy = MVT::v2f64;
21003 else
21004 return SDValue();
21005
21006 SDValue Convert =
21007 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
21008 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
21009}
21010
21011// AArch64 high-vector "long" operations are formed by performing the non-high
21012// version on an extract_subvector of each operand which gets the high half:
21013//
21014// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
21015//
21016// However, there are cases which don't have an extract_high explicitly, but
21017// have another operation that can be made compatible with one for free. For
21018// example:
21019//
21020// (dupv64 scalar) --> (extract_high (dup128 scalar))
21021//
21022// This routine does the actual conversion of such DUPs, once outer routines
21023// have determined that everything else is in order.
21024// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
21025// similarly here.
21026static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
21027  MVT VT = N.getSimpleValueType();
21028 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21029 N.getConstantOperandVal(1) == 0)
21030 N = N.getOperand(0);
21031
21032 switch (N.getOpcode()) {
21033 case AArch64ISD::DUP:
21034 case AArch64ISD::DUPLANE8:
21035 case AArch64ISD::DUPLANE16:
21036 case AArch64ISD::DUPLANE32:
21037 case AArch64ISD::DUPLANE64:
21038 case AArch64ISD::MOVI:
21039 case AArch64ISD::MOVIshift:
21040 case AArch64ISD::MOVIedit:
21041 case AArch64ISD::MOVImsl:
21042 case AArch64ISD::MVNIshift:
21043 case AArch64ISD::MVNImsl:
21044 break;
21045 default:
21046 // FMOV could be supported, but isn't very useful, as it would only occur
21047 // if you passed a bitcast' floating point immediate to an eligible long
21048 // integer op (addl, smull, ...).
21049 return SDValue();
21050 }
21051
21052 if (!VT.is64BitVector())
21053 return SDValue();
21054
21055 SDLoc DL(N);
21056 unsigned NumElems = VT.getVectorNumElements();
21057 if (N.getValueType().is64BitVector()) {
21058 MVT ElementTy = VT.getVectorElementType();
21059 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
21060 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
21061 }
21062
21063 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
21064 DAG.getConstant(NumElems, DL, MVT::i64));
21065}
21066
21067static bool isEssentiallyExtractHighSubvector(SDValue N) {
21068  if (N.getOpcode() == ISD::BITCAST)
21069 N = N.getOperand(0);
21070 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21071 return false;
21072 if (N.getOperand(0).getValueType().isScalableVector())
21073 return false;
21074 return N.getConstantOperandAPInt(1) ==
21075 N.getOperand(0).getValueType().getVectorNumElements() / 2;
21076}
21077
21078/// Helper structure to keep track of ISD::SET_CC operands.
21079struct GenericSetCCInfo {
21080  const SDValue *Opnd0;
21081  const SDValue *Opnd1;
21082  ISD::CondCode CC;
21083};
21084
21085/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
21086struct AArch64SetCCInfo {
21087  const SDValue *Cmp;
21088  AArch64CC::CondCode CC;
21089};
21090
21091/// Helper structure to keep track of SetCC information.
21092union SetCCInfo {
21093  GenericSetCCInfo Generic;
21094  AArch64SetCCInfo AArch64;
21095};
21096
21097/// Helper structure to be able to read SetCC information. If the IsAArch64
21098/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
21099/// GenericSetCCInfo.
21100struct SetCCInfoAndKind {
21101  SetCCInfo Info;
21102  bool IsAArch64;
21103};
21104
21105/// Check whether or not \p Op is a SET_CC operation, either a generic or
21106/// an
21107/// AArch64 lowered one.
21108/// \p SetCCInfo is filled accordingly.
21109/// \post SetCCInfo is meaningful only when this function returns true.
21110/// \return True when Op is a kind of SET_CC operation.
21111static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
21112  // If this is a setcc, this is straightforward.
21113 if (Op.getOpcode() == ISD::SETCC) {
21114 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
21115 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
21116 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21117 SetCCInfo.IsAArch64 = false;
21118 return true;
21119 }
21120 // Otherwise, check if this is a matching csel instruction.
21121 // In other words:
21122 // - csel 1, 0, cc
21123 // - csel 0, 1, !cc
21124 if (Op.getOpcode() != AArch64ISD::CSEL)
21125 return false;
21126 // Set the information about the operands.
21127 // TODO: we want the operands of the Cmp not the csel
21128 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
21129 SetCCInfo.IsAArch64 = true;
21130 SetCCInfo.Info.AArch64.CC =
21131 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21132
21133  // Check that the operands match the constraints:
21134 // (1) Both operands must be constants.
21135 // (2) One must be 1 and the other must be 0.
21136 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
21137 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
21138
21139 // Check (1).
21140 if (!TValue || !FValue)
21141 return false;
21142
21143 // Check (2).
21144 if (!TValue->isOne()) {
21145 // Update the comparison when we are interested in !cc.
21146 std::swap(TValue, FValue);
21147 SetCCInfo.Info.AArch64.CC =
21148        getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
21149  }
21150 return TValue->isOne() && FValue->isZero();
21151}
21152
21153// Returns true if Op is setcc or zext of setcc.
21154static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
21155  if (isSetCC(Op, Info))
21156 return true;
21157 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21158 isSetCC(Op->getOperand(0), Info));
21159}
21160
21161// The folding we want to perform is:
21162// (add x, [zext] (setcc cc ...) )
21163// -->
21164// (csel x, (add x, 1), !cc ...)
21165//
21166// The latter will get matched to a CSINC instruction.
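// For example (illustrative): "x + (a == b)" becomes
//   cmp  w1, w2
//   cinc w0, w0, eq
// (CINC being an alias of CSINC Wd, Wn, Wn, invert(cc)), instead of a separate
// CSET followed by an ADD.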
21167static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
21168  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21169 SDValue LHS = Op->getOperand(0);
21170 SDValue RHS = Op->getOperand(1);
21171 SetCCInfoAndKind InfoAndKind;
21172
21173 // If both operands are a SET_CC, then we don't want to perform this
21174 // folding and create another csel as this results in more instructions
21175 // (and higher register usage).
21176 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21177 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21178 return SDValue();
21179
21180 // If neither operand is a SET_CC, give up.
21181 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21182 std::swap(LHS, RHS);
21183 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21184 return SDValue();
21185 }
21186
21187  // FIXME: This could be generalized to work for FP comparisons.
21188 EVT CmpVT = InfoAndKind.IsAArch64
21189 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21190 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21191 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21192 return SDValue();
21193
21194 SDValue CCVal;
21195 SDValue Cmp;
21196 SDLoc DL(Op);
21197 if (InfoAndKind.IsAArch64) {
21198 CCVal = DAG.getConstant(
21199        getInvertedCondCode(InfoAndKind.Info.AArch64.CC), DL,
21200        MVT::i32);
21201 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21202 } else
21203 Cmp = getAArch64Cmp(
21204 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21205 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21206 DL);
21207
21208 EVT VT = Op->getValueType(0);
21209 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21210 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21211}
21212
21213// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
21214static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
21215  EVT VT = N->getValueType(0);
21216 // Only scalar integer and vector types.
21217 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21218 return SDValue();
21219
21220 SDValue LHS = N->getOperand(0);
21221 SDValue RHS = N->getOperand(1);
21222 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21223 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21224 return SDValue();
21225
21226 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21227 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21228 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21229 return SDValue();
21230
21231 SDValue Op1 = LHS->getOperand(0);
21232 SDValue Op2 = RHS->getOperand(0);
21233 EVT OpVT1 = Op1.getValueType();
21234 EVT OpVT2 = Op2.getValueType();
21235 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21236 Op2.getOpcode() != AArch64ISD::UADDV ||
21237 OpVT1.getVectorElementType() != VT)
21238 return SDValue();
21239
21240 SDValue Val1 = Op1.getOperand(0);
21241 SDValue Val2 = Op2.getOperand(0);
21242 EVT ValVT = Val1->getValueType(0);
21243 SDLoc DL(N);
21244 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21246 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21247 DAG.getConstant(0, DL, MVT::i64));
21248}
21249
21250/// Perform the scalar expression combine in the form of:
21251/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21252/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
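/// For example (illustrative): "b + (cc ? c : 1)" is rewritten to
/// "cc ? (b + c) : (b + 1)", which is exactly CSINC(b + c, b, cc), so the
/// separate add of the select result disappears.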
21253static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
21254  EVT VT = N->getValueType(0);
21255 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21256 return SDValue();
21257
21258 SDValue LHS = N->getOperand(0);
21259 SDValue RHS = N->getOperand(1);
21260
21261  // Handle commutativity.
21262 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21263 LHS.getOpcode() != AArch64ISD::CSNEG) {
21264 std::swap(LHS, RHS);
21265 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21266 LHS.getOpcode() != AArch64ISD::CSNEG) {
21267 return SDValue();
21268 }
21269 }
21270
21271 if (!LHS.hasOneUse())
21272 return SDValue();
21273
21274  AArch64CC::CondCode AArch64CC =
21275      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21276
21277  // The CSEL should include a constant one operand, and the CSNEG should
21278  // include a one or negative-one operand.
21279 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21280 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21281 if (!CTVal || !CFVal)
21282 return SDValue();
21283
21284 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21285 (CTVal->isOne() || CFVal->isOne())) &&
21286 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21287 (CTVal->isOne() || CFVal->isAllOnes())))
21288 return SDValue();
21289
21290 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21291 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21292 !CFVal->isOne()) {
21293 std::swap(CTVal, CFVal);
21294    AArch64CC = getInvertedCondCode(AArch64CC);
21295  }
21296
21297 SDLoc DL(N);
21298 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21299 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21300 !CFVal->isAllOnes()) {
21301 APInt C = -1 * CFVal->getAPIntValue();
21302 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21303 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21304    AArch64CC = getInvertedCondCode(AArch64CC);
21305  }
21306
21307  // It might be neutral for larger constants, as the immediate needs to be
21308  // materialized in a register.
21309 APInt ADDC = CTVal->getAPIntValue();
21310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21311 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21312 return SDValue();
21313
21314 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21315 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21316 "Unexpected constant value");
21317
21318 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21319 SDValue CCVal = getCondCode(DAG, AArch64CC);
21320 SDValue Cmp = LHS.getOperand(3);
21321
21322 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21323}
21324
21325// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21326static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
21327  EVT VT = N->getValueType(0);
21328 if (N->getOpcode() != ISD::ADD)
21329 return SDValue();
21330
21331 SDValue Dot = N->getOperand(0);
21332 SDValue A = N->getOperand(1);
21333  // Handle commutativity.
21334 auto isZeroDot = [](SDValue Dot) {
21335 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21336 Dot.getOpcode() == AArch64ISD::SDOT) &&
21337           isZerosVector(Dot.getOperand(0).getNode());
21338  };
21339 if (!isZeroDot(Dot))
21340 std::swap(Dot, A);
21341 if (!isZeroDot(Dot))
21342 return SDValue();
21343
21344 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21345 Dot.getOperand(2));
21346}
21347
21348static bool isNegatedInteger(SDValue Op) {
21349  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21350}
21351
21352// Try to fold
21353//
21354// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21355//
21356// The folding helps csel to be matched with csneg without generating
21357// redundant neg instruction, which includes negation of the csel expansion
21358// of abs node lowered by lowerABS.
21359static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
21360  if (!isNegatedInteger(SDValue(N, 0)))
21361 return SDValue();
21362
21363 SDValue CSel = N->getOperand(1);
21364 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21365 return SDValue();
21366
21367 SDValue N0 = CSel.getOperand(0);
21368 SDValue N1 = CSel.getOperand(1);
21369
21370 // If neither of them are negations, it's not worth the folding as it
21371 // introduces two additional negations while reducing one negation.
21372 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21373 return SDValue();
21374
21375 SDLoc DL(N);
21376 EVT VT = CSel.getValueType();
21377
21378 SDValue N0N = DAG.getNegative(N0, DL, VT);
21379 SDValue N1N = DAG.getNegative(N1, DL, VT);
21380
21381 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21382 CSel.getOperand(3));
21383}
21384
21385// The basic add/sub long vector instructions have variants with "2" on the end
21386// which act on the high-half of their inputs. They are normally matched by
21387// patterns like:
21388//
21389// (add (zeroext (extract_high LHS)),
21390// (zeroext (extract_high RHS)))
21391// -> uaddl2 vD, vN, vM
21392//
21393// However, if one of the extracts is something like a duplicate, this
21394// instruction can still be used profitably. This function puts the DAG into a
21395// more appropriate form for those patterns to trigger.
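// For example (illustrative):
//   (add (zext (extract_high X)), (zext (dup W)))
// cannot use uaddl2 directly, but once the dup is rewritten below as
// (extract_high (dup128 W)), the normal uaddl2 pattern applies.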
21396static SDValue performAddSubLongCombine(SDNode *N,
21397                                        TargetLowering::DAGCombinerInfo &DCI) {
21398  SelectionDAG &DAG = DCI.DAG;
21399 if (DCI.isBeforeLegalizeOps())
21400 return SDValue();
21401
21402 MVT VT = N->getSimpleValueType(0);
21403 if (!VT.is128BitVector()) {
21404 if (N->getOpcode() == ISD::ADD)
21405 return performSetccAddFolding(N, DAG);
21406 return SDValue();
21407 }
21408
21409 // Make sure both branches are extended in the same way.
21410 SDValue LHS = N->getOperand(0);
21411 SDValue RHS = N->getOperand(1);
21412 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21413 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21414 LHS.getOpcode() != RHS.getOpcode())
21415 return SDValue();
21416
21417 unsigned ExtType = LHS.getOpcode();
21418
21419  // It's only worth doing this if at least one of the inputs is already an
21420  // extract, but we don't know which it'll be so we have to try both.
21421 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21422 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21423 if (!RHS.getNode())
21424 return SDValue();
21425
21426 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21427 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21428 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21429 if (!LHS.getNode())
21430 return SDValue();
21431
21432 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21433 }
21434
21435 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21436}
21437
21438static bool isCMP(SDValue Op) {
21439 return Op.getOpcode() == AArch64ISD::SUBS &&
21440 !Op.getNode()->hasAnyUseOfValue(0);
21441}
21442
21443// (CSEL 1 0 CC Cond) => CC
21444// (CSEL 0 1 CC Cond) => !CC
21445static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21446 if (Op.getOpcode() != AArch64ISD::CSEL)
21447 return std::nullopt;
21448 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21449 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21450 return std::nullopt;
21451 SDValue OpLHS = Op.getOperand(0);
21452 SDValue OpRHS = Op.getOperand(1);
21453 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21454 return CC;
21455 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21456 return getInvertedCondCode(CC);
21457
21458 return std::nullopt;
21459}
21460
21461// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21462// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21463static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21464 SDValue CmpOp = Op->getOperand(2);
21465 if (!isCMP(CmpOp))
21466 return SDValue();
21467
21468 if (IsAdd) {
21469 if (!isOneConstant(CmpOp.getOperand(1)))
21470 return SDValue();
21471 } else {
21472 if (!isNullConstant(CmpOp.getOperand(0)))
21473 return SDValue();
21474 }
21475
21476 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21477 auto CC = getCSETCondCode(CsetOp);
21478 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21479 return SDValue();
21480
21481 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21482 Op->getOperand(0), Op->getOperand(1),
21483 CsetOp.getOperand(3));
21484}
21485
21486// (ADC x 0 cond) => (CINC x HS cond)
21487static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
21488  SDValue LHS = N->getOperand(0);
21489 SDValue RHS = N->getOperand(1);
21490 SDValue Cond = N->getOperand(2);
21491
21492 if (!isNullConstant(RHS))
21493 return SDValue();
21494
21495 EVT VT = N->getValueType(0);
21496 SDLoc DL(N);
21497
21498 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21499  SDValue CC = getCondCode(DAG, AArch64CC::LO);
21500  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21501}
21502
21503static SDValue performBuildVectorCombine(SDNode *N,
21504                                         TargetLowering::DAGCombinerInfo &DCI,
21505                                         SelectionDAG &DAG) {
21506 SDLoc DL(N);
21507 EVT VT = N->getValueType(0);
21508
21509  if (DCI.isAfterLegalizeDAG() &&
21510      (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21511 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21512 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21513 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21514 Elt1->getOpcode() == ISD::FP_ROUND &&
21515 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21516 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21517 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21518        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21519        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21520 // Constant index.
21521        isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21522        isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21523 Elt0->getOperand(0)->getOperand(0) ==
21524 Elt1->getOperand(0)->getOperand(0) &&
21525 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21526 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21527 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21528 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21529 SDValue HighLanes;
21530 if (Elt2->getOpcode() == ISD::UNDEF &&
21531 Elt3->getOpcode() == ISD::UNDEF) {
21532 HighLanes = DAG.getUNDEF(MVT::v2f32);
21533 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21534 Elt3->getOpcode() == ISD::FP_ROUND &&
21535 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21536 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21537 Elt2->getConstantOperandVal(1) ==
21538 Elt3->getConstantOperandVal(1) &&
21539 Elt2->getOperand(0)->getOpcode() ==
21540                         ISD::EXTRACT_VECTOR_ELT &&
21541                     Elt3->getOperand(0)->getOpcode() ==
21542                         ISD::EXTRACT_VECTOR_ELT &&
21543                     // Constant index.
21544 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21545 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21546 Elt2->getOperand(0)->getOperand(0) ==
21547 Elt3->getOperand(0)->getOperand(0) &&
21548 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21549 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21550 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21551 HighLanes =
21552 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21553 }
21554 if (HighLanes) {
21555 SDValue DoubleToSingleSticky =
21556 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21557 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21558 DoubleToSingleSticky, HighLanes);
21559 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21560 Elt0->getOperand(1));
21561 }
21562 }
21563 }
21564 }
21565
21566 if (VT == MVT::v2f64) {
21567 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21568 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21569 Elt1->getOpcode() == ISD::FP_EXTEND &&
21570        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21571        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21572 Elt0->getOperand(0)->getOperand(0) ==
21573 Elt1->getOperand(0)->getOperand(0) &&
21574 // Constant index.
21575        isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21576        isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21577 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21578 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21579 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21580 // ResultType's known minimum vector length.
21581 Elt0->getOperand(0)->getConstantOperandVal(1) %
21582                VT.getVectorMinNumElements() ==
21583            0) {
21584 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21585 if (SrcVec.getValueType() == MVT::v4f16 ||
21586 SrcVec.getValueType() == MVT::v4bf16) {
21587 SDValue HalfToSingle =
21588 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21589 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21590 SDValue Extract = DAG.getNode(
21591          ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
21592          HalfToSingle, SubvectorIdx);
21593 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21594 }
21595 }
21596 }
21597
21598 // A build vector of two extracted elements is equivalent to an
21599 // extract subvector where the inner vector is any-extended to the
21600 // extract_vector_elt VT.
21601 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21602 // (extract_elt_iXX_to_i32 vec Idx+1))
21603 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21604
21605 // For now, only consider the v2i32 case, which arises as a result of
21606 // legalization.
21607 if (VT != MVT::v2i32)
21608 return SDValue();
21609
21610 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21611 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21612 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21613 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21614 // Constant index.
21615 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21616 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21617 // Both EXTRACT_VECTOR_ELT from same vector...
21618 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21619 // ... and contiguous. First element's index +1 == second element's index.
21620 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21621 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21622 // ResultType's known minimum vector length.
21623 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21624 SDValue VecToExtend = Elt0->getOperand(0);
21625 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21626 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21627 return SDValue();
21628
21629 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21630
21631 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21632 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21633 SubvectorIdx);
21634 }
21635
21636 return SDValue();
21637}
21638
21639// A special combine for the sqdmulh family of instructions.
21640// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ),
21641// SATURATING_VAL ) can be reduced to sqdmulh(...)
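// For example (illustrative), with i16 elements:
//   smin(sra(mul(sext x, sext y), 15), 32767)
// computes (x*y) >> 15 clamped at INT16_MAX; the clamp only matters when
// x == y == INT16_MIN, which is exactly the saturating doubling
// multiply-high-half that sqdmulh performs.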
21642static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
21643
21644 if (N->getOpcode() != ISD::SMIN)
21645 return SDValue();
21646
21647 EVT DestVT = N->getValueType(0);
21648
21649 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21650 DestVT.isScalableVector())
21651 return SDValue();
21652
21653 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21654
21655 if (!Clamp)
21656 return SDValue();
21657
21658 MVT ScalarType;
21659 unsigned ShiftAmt = 0;
21660 switch (Clamp->getSExtValue()) {
21661 case (1ULL << 15) - 1:
21662 ScalarType = MVT::i16;
21663 ShiftAmt = 16;
21664 break;
21665 case (1ULL << 31) - 1:
21666 ScalarType = MVT::i32;
21667 ShiftAmt = 32;
21668 break;
21669 default:
21670 return SDValue();
21671 }
21672
21673 SDValue Sra = N->getOperand(0);
21674 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21675 return SDValue();
21676
21677 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21678 if (!RightShiftVec)
21679 return SDValue();
21680 unsigned SExtValue = RightShiftVec->getSExtValue();
21681
21682 if (SExtValue != (ShiftAmt - 1))
21683 return SDValue();
21684
21685 SDValue Mul = Sra.getOperand(0);
21686 if (Mul.getOpcode() != ISD::MUL)
21687 return SDValue();
21688
21689 SDValue SExt0 = Mul.getOperand(0);
21690 SDValue SExt1 = Mul.getOperand(1);
21691
21692 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21693 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21694 return SDValue();
21695
21696 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21697 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21698
21699 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21700 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21701 SExt0Type.getVectorNumElements() == 1)
21702 return SDValue();
21703
21704 SDLoc DL(N);
21705 SDValue V0 = SExt0.getOperand(0);
21706 SDValue V1 = SExt1.getOperand(0);
21707
21708 // Ensure input vectors are extended to legal types
21709 if (SExt0Type.getFixedSizeInBits() < 64) {
21710 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21711 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21712 VecNumElements);
21713 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21714 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21715 }
21716
21717 SDValue SQDMULH =
21718 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21719
21720 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21721}
21722
21723static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
21724  if (SDValue V = trySQDMULHCombine(N, DAG)) {
21725 return V;
21726 }
21727
21728 return SDValue();
21729}
21730
21731static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
21732                                      TargetLowering::DAGCombinerInfo &DCI) {
21733  SDLoc DL(N);
21734 EVT VT = N->getValueType(0);
21735 SDValue N0 = N->getOperand(0);
21736 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21737 N0.getOpcode() == AArch64ISD::DUP) {
21738 SDValue Op = N0.getOperand(0);
21739 if (VT.getScalarType() == MVT::i32 &&
21740 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21741 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21742 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21743 }
21744
21745 // Performing the following combine produces a preferable form for ISEL.
21746 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
21747  if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21748      N0.hasOneUse()) {
21749 SDValue Op = N0.getOperand(0);
21750 SDValue ExtractIndexNode = N0.getOperand(1);
21751 if (!isa<ConstantSDNode>(ExtractIndexNode))
21752 return SDValue();
21753
21754 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21755 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21756 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21757 "Unexpected legalisation result!");
21758
21759 EVT SrcVectorType = Op.getValueType();
21760 // We also assume that SrcVectorType cannot be a V64 (see
21761 // LowerEXTRACT_VECTOR_ELT).
21762 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21763 "Unexpected legalisation result!");
21764
21765 unsigned ExtractIndex =
21766 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21767 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21768
21769 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21770 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21771 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21772 }
21773
21774 return SDValue();
21775}
21776
21777// Check whether a node is an extend or shift operand
21778static bool isExtendOrShiftOperand(SDValue N) {
21779  unsigned Opcode = N.getOpcode();
21780 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21781 EVT SrcVT;
21782 if (Opcode == ISD::SIGN_EXTEND_INREG)
21783 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21784 else
21785 SrcVT = N.getOperand(0).getValueType();
21786
21787 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21788 } else if (Opcode == ISD::AND) {
21789 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21790 if (!CSD)
21791 return false;
21792 uint64_t AndMask = CSD->getZExtValue();
21793 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21794 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21795 return isa<ConstantSDNode>(N.getOperand(1));
21796 }
21797
21798 return false;
21799}
21800
21801// (N - Y) + Z --> (Z - Y) + N
21802// when N is an extend or shift operand
21803static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
21804                                         SelectionDAG &DAG) {
21805 auto IsOneUseExtend = [](SDValue N) {
21806 return N.hasOneUse() && isExtendOrShiftOperand(N);
21807 };
21808
21809  // DAGCombiner will revert the combination when Z is a constant, causing an
21810  // infinite loop, so don't enable the combination when Z is a constant.
21811  // If Z is a one-use extend or shift, we also can't do the optimization;
21812  // it would fall into the same infinite loop.
21813 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21814 return SDValue();
21815
21816 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21817 return SDValue();
21818
21819 SDValue Shift = SUB.getOperand(0);
21820 if (!IsOneUseExtend(Shift))
21821 return SDValue();
21822
21823 SDLoc DL(N);
21824 EVT VT = N->getValueType(0);
21825
21826 SDValue Y = SUB.getOperand(1);
21827 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21828 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21829}
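// Illustrative example of the rewrite above, assuming (sext w) is the one-use
// extend or shift operand:
//   add (sub (sext w), y), z  -->  add (sub z, y), (sext w)
// which lets the extend fold into the final add as an extended-register
// operand (provided z is neither a constant nor itself a one-use extend/shift).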
21830
21832 SelectionDAG &DAG) {
21833 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21834 // commutative.
21835 if (N->getOpcode() != ISD::ADD)
21836 return SDValue();
21837
21838 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21839 // shifted register is only available for i32 and i64.
21840 EVT VT = N->getValueType(0);
21841 if (VT != MVT::i32 && VT != MVT::i64)
21842 return SDValue();
21843
21844 SDLoc DL(N);
21845 SDValue LHS = N->getOperand(0);
21846 SDValue RHS = N->getOperand(1);
21847
21848 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21849 return Val;
21850 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21851 return Val;
21852
21853 uint64_t LHSImm = 0, RHSImm = 0;
21854  // If both operands are shifted by an immediate and one operand's shift
21855  // amount is not greater than 4, swap LHS and RHS to put the operand with
21856  // the smaller shift amount on the RHS.
21857  //
21858  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21859  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21860  // with LSL (shift > 4). For other processors, this is a no-op for both
21861  // performance and correctness.
21862 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21863 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21864 RHSImm > 4 && LHS.hasOneUse())
21865 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21866
21867 return SDValue();
21868}
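// For example, with the heuristic above:
//   add (shl x, 2), (shl y, 8)  -->  add (shl y, 8), (shl x, 2)
// so that the cheap LSL (shift <= 4) ends up as the shifted-register operand,
// provided the small-shift operand has a single use.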
21869
21870// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21871// This reassociates it back to allow the creation of more mls instructions.
21873 if (N->getOpcode() != ISD::SUB)
21874 return SDValue();
21875
21876 SDValue Add = N->getOperand(1);
21877 SDValue X = N->getOperand(0);
21878 if (Add.getOpcode() != ISD::ADD)
21879 return SDValue();
21880
21881 if (!Add.hasOneUse())
21882 return SDValue();
21884 return SDValue();
21885
21886 SDValue M1 = Add.getOperand(0);
21887 SDValue M2 = Add.getOperand(1);
21888 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21889 M1.getOpcode() != AArch64ISD::UMULL)
21890 return SDValue();
21891 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21892 M2.getOpcode() != AArch64ISD::UMULL)
21893 return SDValue();
21894
21895 EVT VT = N->getValueType(0);
21896 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21897 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21898}
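// For example, the reassociation above turns
//   sub x, (add (mul a, b), (mul c, d))
// back into
//   sub (sub x, (mul a, b)), (mul c, d)
// so that each step can be selected as an mls (or [us]mlsl) instruction.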
21899
21900// Combine into mla/mls.
21901// This works on the patterns of:
21902// add v1, (mul v2, v3)
21903// sub v1, (mul v2, v3)
21904// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21905// It will transform the add/sub to a scalable version, so that we can
21906// make use of SVE's MLA/MLS that will be generated for that pattern.
21907static SDValue
21909 SelectionDAG &DAG = DCI.DAG;
21910 // Make sure that the types are legal
21911 if (!DCI.isAfterLegalizeDAG())
21912 return SDValue();
21913 // Before using SVE's features, check first if it's available.
21914 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21915 return SDValue();
21916
21917 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21918 return SDValue();
21919
21920 if (!N->getValueType(0).isFixedLengthVector())
21921 return SDValue();
21922
21923 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21924 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21925 return SDValue();
21926
21927 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21928 return SDValue();
21929
21930 SDValue MulValue = Op1->getOperand(0);
21931 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21932 return SDValue();
21933
21934 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21935 return SDValue();
21936
21937 EVT ScalableVT = MulValue.getValueType();
21938 if (!ScalableVT.isScalableVector())
21939 return SDValue();
21940
21941 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21942 SDValue NewValue =
21943 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21944 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21945 };
21946
21947 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21948 return res;
21949 else if (N->getOpcode() == ISD::ADD)
21950 return performOpt(N->getOperand(1), N->getOperand(0));
21951
21952 return SDValue();
21953}
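// A minimal sketch of the pattern handled above: for a fixed-length add/sub
// whose other operand is extract_subvector(MUL_PRED, 0), the fixed operand is
// converted to the scalable container type so that the whole expression becomes
// a scalable add/sub of a predicated multiply, which isel can then match as SVE
// MLA/MLS.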
21954
21955// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21956// help, for example, to produce ssra from sshr+add.
21958 EVT VT = N->getValueType(0);
21959 if (VT != MVT::i64 ||
21960 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21961 return SDValue();
21962 SDValue Op0 = N->getOperand(0);
21963 SDValue Op1 = N->getOperand(1);
21964
21965 // At least one of the operands should be an extract, and the other should be
21966 // something that is easy to convert to v1i64 type (in this case a load).
21967 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21968 Op0.getOpcode() != ISD::LOAD)
21969 return SDValue();
21970 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21971 Op1.getOpcode() != ISD::LOAD)
21972 return SDValue();
21973
21974 SDLoc DL(N);
21975 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21976 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21977 Op0 = Op0.getOperand(0);
21978 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21979 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21980 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21981 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21982 Op1 = Op1.getOperand(0);
21983 } else
21984 return SDValue();
21985
21986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21987 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21988 DAG.getConstant(0, DL, MVT::i64));
21989}
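// Illustrative example of the combine above:
//   i64 (add (extract_vector_elt v1i64:X, 0), (load p))
// becomes
//   i64 (extract_vector_elt (add v1i64:X, (scalar_to_vector (load p))), 0)
// keeping the arithmetic in the vector domain so patterns like ssra can match.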
21990
21993 if (!BV->hasOneUse())
21994 return false;
21995 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21996 if (!Ld || !Ld->isSimple())
21997 return false;
21998 Loads.push_back(Ld);
21999 return true;
22000 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
22002 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
22003 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
22004 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
22005 return false;
22006 Loads.push_back(Ld);
22007 }
22008 return true;
22009 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
22010 // Try to find a tree of shuffles and concats from how IR shuffles of loads
22011 // are lowered. Note that this only comes up because we do not always visit
22012 // operands before uses. After that is fixed this can be removed and in the
22013 // meantime this is fairly specific to the lowering we expect from IR.
22014 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
22015 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
22016 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
22017 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
22018 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
22019 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
22020 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
22021 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
22022 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
22023 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
22024 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
22025 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22026 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22027 B.getOperand(1).getNumOperands() != 4)
22028 return false;
22029 auto SV1 = cast<ShuffleVectorSDNode>(B);
22030 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
22031 int NumElts = B.getValueType().getVectorNumElements();
22032 int NumSubElts = NumElts / 4;
22033 for (int I = 0; I < NumSubElts; I++) {
22034 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
22035 if (SV1->getMaskElt(I) != I ||
22036 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22037 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
22038 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
22039 return false;
22040 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
22041 if (SV2->getMaskElt(I) != I ||
22042 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22043 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
22044 return false;
22045 }
22046 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
22047 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
22048 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
22049 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
22050 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
22051 !Ld2->isSimple() || !Ld3->isSimple())
22052 return false;
22053 Loads.push_back(Ld0);
22054 Loads.push_back(Ld1);
22055 Loads.push_back(Ld2);
22056 Loads.push_back(Ld3);
22057 return true;
22058 }
22059 return false;
22060}
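// Summarising the helper above: given a one-use value, it collects the
// underlying simple loads when the value is itself a load, a build_vector or
// concat_vectors of one-use simple loads, or the specific shuffle-of-concats-
// of-loads tree shown in the comment; anything else returns false.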
22061
22063 SelectionDAG &DAG,
22064 unsigned &NumSubLoads) {
22065 if (!Op0.hasOneUse() || !Op1.hasOneUse())
22066 return false;
22067
22068 SmallVector<LoadSDNode *> Loads0, Loads1;
22069 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22070 isLoadOrMultipleLoads(Op1, Loads1)) {
22071 if (NumSubLoads && Loads0.size() != NumSubLoads)
22072 return false;
22073 NumSubLoads = Loads0.size();
22074 return Loads0.size() == Loads1.size() &&
22075 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
22076 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
22077 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
22079 Size / 8, 1);
22080 });
22081 }
22082
22083 if (Op0.getOpcode() != Op1.getOpcode())
22084 return false;
22085
22086 switch (Op0.getOpcode()) {
22087 case ISD::ADD:
22088 case ISD::SUB:
22090 DAG, NumSubLoads) &&
22092 DAG, NumSubLoads);
22093 case ISD::SIGN_EXTEND:
22094 case ISD::ANY_EXTEND:
22095 case ISD::ZERO_EXTEND:
22096 EVT XVT = Op0.getOperand(0).getValueType();
22097 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
22098 XVT.getScalarSizeInBits() != 32)
22099 return false;
22101 DAG, NumSubLoads);
22102 }
22103 return false;
22104}
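// Roughly, the recursion above requires the two trees to be structurally
// identical (the same add/sub/extend opcodes all the way down), with leaf loads
// of matching size that sit next to each other in memory, so that each pair of
// leaves can later be replaced by a single double-width load.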
22105
22106// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
22107// into a single load of twice the size, from which we extract the bottom and
22108// top parts so that the shl can use a shll2 instruction. The two loads in that
22109// example can also be larger trees of instructions, which are identical except
22110// for the leaves, which are all loads offset from the LHS, including
22111// buildvectors of multiple loads. For example the RHS tree could be
22112// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
22113// Whilst it can be common for the larger loads to replace LDP instructions
22114// (which doesn't gain anything on its own), the larger loads can help create
22115// more efficient code, and in buildvectors prevent the need for ld1 lane
22116// inserts, which can be slower than normal loads.
22118 EVT VT = N->getValueType(0);
22119 if (!VT.isFixedLengthVector() ||
22120 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
22121 VT.getScalarSizeInBits() != 64))
22122 return SDValue();
22123
22124 SDValue Other = N->getOperand(0);
22125 SDValue Shift = N->getOperand(1);
22126 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
22127 std::swap(Shift, Other);
22128 APInt ShiftAmt;
22129 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
22130 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
22131 return SDValue();
22132
22133 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
22134 !ISD::isExtOpcode(Other.getOpcode()) ||
22135 Shift.getOperand(0).getOperand(0).getValueType() !=
22136 Other.getOperand(0).getValueType() ||
22137 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
22138 return SDValue();
22139
22140 SDValue Op0 = Other.getOperand(0);
22141 SDValue Op1 = Shift.getOperand(0).getOperand(0);
22142
22143 unsigned NumSubLoads = 0;
22144 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
22145 return SDValue();
22146
22147  // Attempt to rule out some unprofitable cases using heuristics (some working
22148  // around suboptimal code generation), notably if the extend would not be able
22149  // to use ushll2 instructions because the types are not large enough. Otherwise
22150  // zips will need to be created, which can increase the instruction count.
22151 unsigned NumElts = Op0.getValueType().getVectorNumElements();
22152 unsigned NumSubElts = NumElts / NumSubLoads;
22153 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
22154 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
22155 Op0.getValueType().getSizeInBits() < 128 &&
22157 return SDValue();
22158
22159 // Recreate the tree with the new combined loads.
22160 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22161 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22162 EVT DVT =
22164
22165 SmallVector<LoadSDNode *> Loads0, Loads1;
22166 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22167 isLoadOrMultipleLoads(Op1, Loads1)) {
22168 EVT LoadVT = EVT::getVectorVT(
22169 *DAG.getContext(), Op0.getValueType().getScalarType(),
22170 Op0.getValueType().getVectorNumElements() / Loads0.size());
22171 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22172
22173 SmallVector<SDValue> NewLoads;
22174 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22175 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22176 L0->getBasePtr(), L0->getPointerInfo(),
22177 L0->getBaseAlign());
22178 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22179 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22180 NewLoads.push_back(Load);
22181 }
22182 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22183 }
22184
22186 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22187 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22188 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22189 };
22190 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22191
22192 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22193 int Hi = NumSubElts, Lo = 0;
22194 for (unsigned i = 0; i < NumSubLoads; i++) {
22195 for (unsigned j = 0; j < NumSubElts; j++) {
22196 LowMask[i * NumSubElts + j] = Lo++;
22197 HighMask[i * NumSubElts + j] = Hi++;
22198 }
22199 Lo += NumSubElts;
22200 Hi += NumSubElts;
22201 }
22202 SDLoc DL(N);
22203 SDValue Ext0, Ext1;
22204  // Extract the top and bottom lanes, then extend the result. If the two
22205  // operands use the same extend, instead extend the result first and then
22206  // extract the lanes, as that produces slightly smaller code.
22207 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22209 NewOp, DAG.getConstant(0, DL, MVT::i64));
22210 SDValue SubH =
22211 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22212 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22213 SDValue Extr0 =
22214 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22215 SDValue Extr1 =
22216 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22217 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22218 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22219 } else {
22221 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22222 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22223 DAG.getConstant(0, DL, MVT::i64));
22224 SDValue SubH =
22225 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22226 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22227 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22228 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22229 }
22230 SDValue NShift =
22231 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22232 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22233}
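// A small illustrative instance of the fold above:
//   add (zext (v8i8 load p)), (shl (zext (v8i8 load p+8)), splat(1))
// becomes a single v16i8 load of p; the result is zero-extended and split into
// low and high halves (allowing ushll/ushll2 style codegen), and the shift and
// add are then applied to those halves.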
22234
22237 // Try to change sum of two reductions.
22238 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22239 return Val;
22240 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22241 return Val;
22242 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22243 return Val;
22244 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22245 return Val;
22246 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22247 return Val;
22249 return Val;
22250 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22251 return Val;
22252 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22253 return Val;
22254 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22255 return Val;
22256
22257 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22258 return Val;
22259
22260 return performAddSubLongCombine(N, DCI);
22261}
22262
22263// Massage DAGs which we can use the high-half "long" operations on into
22264// something isel will recognize better. E.g.
22265//
22266// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22267//   (aarch64_neon_umull (extract_high (v2i64 vec))
22268//                       (extract_high (v2i64 (dup128 scalar))))
22269//
22272 SelectionDAG &DAG) {
22273 if (DCI.isBeforeLegalizeOps())
22274 return SDValue();
22275
22276 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22277 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22278 assert(LHS.getValueType().is64BitVector() &&
22279 RHS.getValueType().is64BitVector() &&
22280 "unexpected shape for long operation");
22281
22282 // Either node could be a DUP, but it's not worth doing both of them (you'd
22283 // just as well use the non-high version) so look for a corresponding extract
22284 // operation on the other "wing".
22287 if (!RHS.getNode())
22288 return SDValue();
22291 if (!LHS.getNode())
22292 return SDValue();
22293 } else
22294 return SDValue();
22295
22296 if (IID == Intrinsic::not_intrinsic)
22297 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22298
22299 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22300 N->getOperand(0), LHS, RHS);
22301}
22302
22303static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22304 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22305 unsigned ElemBits = ElemTy.getSizeInBits();
22306
22307 int64_t ShiftAmount;
22308 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22309 APInt SplatValue, SplatUndef;
22310 unsigned SplatBitSize;
22311 bool HasAnyUndefs;
22312 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22313 HasAnyUndefs, ElemBits) ||
22314 SplatBitSize != ElemBits)
22315 return SDValue();
22316
22317 ShiftAmount = SplatValue.getSExtValue();
22318 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22319 ShiftAmount = CVN->getSExtValue();
22320 } else
22321 return SDValue();
22322
22323 // If the shift amount is zero, remove the shift intrinsic.
22324 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22325 return N->getOperand(1);
22326
22327 unsigned Opcode;
22328 bool IsRightShift;
22329 switch (IID) {
22330 default:
22331 llvm_unreachable("Unknown shift intrinsic");
22332 case Intrinsic::aarch64_neon_sqshl:
22333 Opcode = AArch64ISD::SQSHL_I;
22334 IsRightShift = false;
22335 break;
22336 case Intrinsic::aarch64_neon_uqshl:
22337 Opcode = AArch64ISD::UQSHL_I;
22338 IsRightShift = false;
22339 break;
22340 case Intrinsic::aarch64_neon_srshl:
22341 Opcode = AArch64ISD::SRSHR_I;
22342 IsRightShift = true;
22343 break;
22344 case Intrinsic::aarch64_neon_urshl:
22345 Opcode = AArch64ISD::URSHR_I;
22346 IsRightShift = true;
22347 break;
22348 case Intrinsic::aarch64_neon_sqshlu:
22349 Opcode = AArch64ISD::SQSHLU_I;
22350 IsRightShift = false;
22351 break;
22352 case Intrinsic::aarch64_neon_sshl:
22353 case Intrinsic::aarch64_neon_ushl:
22354 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22355 // left shift for positive shift amounts. For negative shifts we can use a
22356 // VASHR/VLSHR as appropriate.
22357 if (ShiftAmount < 0) {
22358 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22359 : AArch64ISD::VLSHR;
22360 ShiftAmount = -ShiftAmount;
22361 } else
22362 Opcode = AArch64ISD::VSHL;
22363 IsRightShift = false;
22364 break;
22365 }
22366
22367 EVT VT = N->getValueType(0);
22368 SDValue Op = N->getOperand(1);
22369 SDLoc DL(N);
22370 if (VT == MVT::i64) {
22371 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22372 VT = MVT::v1i64;
22373 }
22374
22375 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22376 Op = DAG.getNode(Opcode, DL, VT, Op,
22377 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32, true));
22378 if (N->getValueType(0) == MVT::i64)
22379 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22380 DAG.getConstant(0, DL, MVT::i64));
22381 return Op;
22382 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22383 Op = DAG.getNode(Opcode, DL, VT, Op,
22384 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
22385 if (N->getValueType(0) == MVT::i64)
22386 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22387 DAG.getConstant(0, DL, MVT::i64));
22388 return Op;
22389 }
22390
22391 return SDValue();
22392}
22393
22394// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22395// the intrinsics must be legal and take an i32, this means there's almost
22396// certainly going to be a zext in the DAG which we can eliminate.
22397static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22398 SDValue AndN = N->getOperand(2);
22399 if (AndN.getOpcode() != ISD::AND)
22400 return SDValue();
22401
22403 if (!CMask || CMask->getZExtValue() != Mask)
22404 return SDValue();
22405
22406 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22407 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22408}
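// For example, the combine above rewrites
//   crc32b(crc, and(x, 0xff))  -->  crc32b(crc, x)
// because CRC32B only reads the low 8 bits of its data operand anyway.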
22409
22411 SelectionDAG &DAG) {
22412 SDLoc DL(N);
22413 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22414 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22415 N->getOperand(1)),
22416 DAG.getConstant(0, DL, MVT::i64));
22417}
22418
22420 SDLoc DL(N);
22421 SDValue Op1 = N->getOperand(1);
22422 SDValue Op2 = N->getOperand(2);
22423 EVT ScalarTy = Op2.getValueType();
22424 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22425 ScalarTy = MVT::i32;
22426
22427  // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
22428 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22429 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22430 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22431 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22432 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22433}
22434
22436 SDLoc DL(N);
22437 SDValue Scalar = N->getOperand(3);
22438 EVT ScalarTy = Scalar.getValueType();
22439
22440 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22441 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
22442
22443 SDValue Passthru = N->getOperand(1);
22444 SDValue Pred = N->getOperand(2);
22445 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22446 Pred, Scalar, Passthru);
22447}
22448
22450 SDLoc DL(N);
22451 LLVMContext &Ctx = *DAG.getContext();
22452 EVT VT = N->getValueType(0);
22453
22454 assert(VT.isScalableVector() && "Expected a scalable vector.");
22455
22456 // Current lowering only supports the SVE-ACLE types.
22458 return SDValue();
22459
22460 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22461 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22462 EVT ByteVT =
22463 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22464
22465  // Convert everything to the domain of EXT (i.e. bytes).
22466 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22467 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22468 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22469 DAG.getConstant(ElemSize, DL, MVT::i32));
22470
22471 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22472 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22473}
22474
22477 SelectionDAG &DAG) {
22478 if (DCI.isBeforeLegalize())
22479 return SDValue();
22480
22481 SDValue Comparator = N->getOperand(3);
22482 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22483 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22484 unsigned IID = getIntrinsicID(N);
22485 EVT VT = N->getValueType(0);
22486 EVT CmpVT = N->getOperand(2).getValueType();
22487 SDValue Pred = N->getOperand(1);
22488 SDValue Imm;
22489 SDLoc DL(N);
22490
22491 switch (IID) {
22492 default:
22493 llvm_unreachable("Called with wrong intrinsic!");
22494 break;
22495
22496 // Signed comparisons
22497 case Intrinsic::aarch64_sve_cmpeq_wide:
22498 case Intrinsic::aarch64_sve_cmpne_wide:
22499 case Intrinsic::aarch64_sve_cmpge_wide:
22500 case Intrinsic::aarch64_sve_cmpgt_wide:
22501 case Intrinsic::aarch64_sve_cmplt_wide:
22502 case Intrinsic::aarch64_sve_cmple_wide: {
22503 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22504 int64_t ImmVal = CN->getSExtValue();
22505 if (ImmVal >= -16 && ImmVal <= 15)
22506 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22507 else
22508 return SDValue();
22509 }
22510 break;
22511 }
22512 // Unsigned comparisons
22513 case Intrinsic::aarch64_sve_cmphs_wide:
22514 case Intrinsic::aarch64_sve_cmphi_wide:
22515 case Intrinsic::aarch64_sve_cmplo_wide:
22516 case Intrinsic::aarch64_sve_cmpls_wide: {
22517 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22518 uint64_t ImmVal = CN->getZExtValue();
22519 if (ImmVal <= 127)
22520 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22521 else
22522 return SDValue();
22523 }
22524 break;
22525 }
22526 }
22527
22528 if (!Imm)
22529 return SDValue();
22530
22531 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22532 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22533 N->getOperand(2), Splat, DAG.getCondCode(CC));
22534 }
22535
22536 return SDValue();
22537}
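// Illustrative example: an aarch64_sve_cmpge_wide whose wide comparator is a
// dup/splat of the constant 7 is rewritten above as a SETCC_MERGE_ZERO against
// a splat(7) in the narrow element type with condition SETGE, since 7 fits the
// signed immediate range [-16, 15].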
22538
22541 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22542
22543 SDLoc DL(Op);
22544 assert(Op.getValueType().isScalableVector() &&
22545 TLI.isTypeLegal(Op.getValueType()) &&
22546 "Expected legal scalable vector type!");
22547 assert(Op.getValueType() == Pg.getValueType() &&
22548 "Expected same type for PTEST operands");
22549
22550 // Ensure target specific opcodes are using legal type.
22551 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22552 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22553 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22554
22555 // Ensure operands have type nxv16i1.
22556 if (Op.getValueType() != MVT::nxv16i1) {
22559 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22560 else
22561 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22562 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22563 }
22564
22565 unsigned PTest = AArch64ISD::PTEST;
22567 PTest = AArch64ISD::PTEST_ANY;
22568 else if (Cond == AArch64CC::FIRST_ACTIVE)
22569 PTest = AArch64ISD::PTEST_FIRST;
22570
22571 // Set condition code (CC) flags.
22572 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
22573
22574 // Convert CC to integer based on requested condition.
22575 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22576 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22577 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22578 return DAG.getZExtOrTrunc(Res, DL, VT);
22579}
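// In short, the helper above reinterprets the operands as nxv16i1, emits a
// PTEST-style node to set the NZCV flags, and then materialises the boolean
// result with a CSEL between 1 and 0 using the inverted condition code.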
22580
22582 SelectionDAG &DAG) {
22583 SDLoc DL(N);
22584
22585 SDValue Pred = N->getOperand(1);
22586 SDValue VecToReduce = N->getOperand(2);
22587
22588 // NOTE: The integer reduction's result type is not always linked to the
22589 // operand's element type so we construct it from the intrinsic's result type.
22590 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22591 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22592
22593 // SVE reductions set the whole vector register with the first element
22594 // containing the reduction result, which we'll now extract.
22595 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22596 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22597 Zero);
22598}
22599
22601 SelectionDAG &DAG) {
22602 SDLoc DL(N);
22603
22604 SDValue Pred = N->getOperand(1);
22605 SDValue VecToReduce = N->getOperand(2);
22606
22607 EVT ReduceVT = VecToReduce.getValueType();
22608 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22609
22610 // SVE reductions set the whole vector register with the first element
22611 // containing the reduction result, which we'll now extract.
22612 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22613 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22614 Zero);
22615}
22616
22618 SelectionDAG &DAG) {
22619 SDLoc DL(N);
22620
22621 SDValue Pred = N->getOperand(1);
22622 SDValue InitVal = N->getOperand(2);
22623 SDValue VecToReduce = N->getOperand(3);
22624 EVT ReduceVT = VecToReduce.getValueType();
22625
22626 // Ordered reductions use the first lane of the result vector as the
22627 // reduction's initial value.
22628 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22629 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22630 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22631
22632 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22633
22634 // SVE reductions set the whole vector register with the first element
22635 // containing the reduction result, which we'll now extract.
22636 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22637 Zero);
22638}
22639
22641 SelectionDAG &DAG) {
22642 if (N->getValueType(0) != MVT::i16)
22643 return SDValue();
22644
22645 SDLoc DL(N);
22646 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
22647 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
22648 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
22649}
22650
22651// If a merged operation has no inactive lanes we can relax it to a predicated
22652// or unpredicated operation, which potentially allows better isel (perhaps
22653// using immediate forms) or relaxing register reuse requirements.
22655 SelectionDAG &DAG, bool UnpredOp = false,
22656 bool SwapOperands = false) {
22657 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22658 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22659 SDValue Pg = N->getOperand(1);
22660 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22661 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22662
22663 // ISD way to specify an all active predicate.
22664 if (isAllActivePredicate(DAG, Pg)) {
22665 if (UnpredOp)
22666 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22667
22668 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22669 }
22670
22671 // FUTURE: SplatVector(true)
22672 return SDValue();
22673}
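// For example (see the intrinsic switch below), aarch64_sve_sqadd with an
// all-active governing predicate is relaxed by the helper above to a plain
// ISD::SADDSAT of its two data operands.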
22674
22675static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22676 SDLoc DL(N);
22677 EVT VT = N->getValueType(0);
22678 SDValue Op1 = N->getOperand(1);
22679 SDValue Op2 = N->getOperand(2);
22680 SDValue Op3 = N->getOperand(3);
22681
22682 switch (IID) {
22683 default:
22684 llvm_unreachable("Called with wrong intrinsic!");
22685 case Intrinsic::aarch64_sve_bsl:
22686 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22687 case Intrinsic::aarch64_sve_bsl1n:
22688 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22689 Op2);
22690 case Intrinsic::aarch64_sve_bsl2n:
22691 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22692 DAG.getNOT(DL, Op2, VT));
22693 case Intrinsic::aarch64_sve_nbsl:
22694 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22695 VT);
22696 }
22697}
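// All four SVE bitwise-select intrinsics above map onto AArch64ISD::BSP with
// the selector operand first; bsl1n/bsl2n invert one of the inputs and nbsl
// inverts the result.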
22698
22701 const AArch64Subtarget *Subtarget) {
22702 SelectionDAG &DAG = DCI.DAG;
22703 unsigned IID = getIntrinsicID(N);
22704 switch (IID) {
22705 default:
22706 break;
22707 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22708 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22709 return tryCombineFixedPointConvert(N, DCI, DAG);
22710 case Intrinsic::aarch64_neon_saddv:
22711 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22712 case Intrinsic::aarch64_neon_uaddv:
22713 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22714 case Intrinsic::aarch64_neon_sminv:
22715 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22716 case Intrinsic::aarch64_neon_uminv:
22717 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22718 case Intrinsic::aarch64_neon_smaxv:
22719 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22720 case Intrinsic::aarch64_neon_umaxv:
22721 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22722 case Intrinsic::aarch64_neon_fmax:
22723 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22724 N->getOperand(1), N->getOperand(2));
22725 case Intrinsic::aarch64_neon_fmin:
22726 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22727 N->getOperand(1), N->getOperand(2));
22728 case Intrinsic::aarch64_neon_fmaxnm:
22729 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22730 N->getOperand(1), N->getOperand(2));
22731 case Intrinsic::aarch64_neon_fminnm:
22732 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22733 N->getOperand(1), N->getOperand(2));
22734 case Intrinsic::aarch64_neon_smull:
22735 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22736 N->getOperand(1), N->getOperand(2));
22737 case Intrinsic::aarch64_neon_umull:
22738 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22739 N->getOperand(1), N->getOperand(2));
22740 case Intrinsic::aarch64_neon_pmull:
22741 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22742 N->getOperand(1), N->getOperand(2));
22743 case Intrinsic::aarch64_neon_sqdmull:
22744 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22745 case Intrinsic::aarch64_neon_sqshl:
22746 case Intrinsic::aarch64_neon_uqshl:
22747 case Intrinsic::aarch64_neon_sqshlu:
22748 case Intrinsic::aarch64_neon_srshl:
22749 case Intrinsic::aarch64_neon_urshl:
22750 case Intrinsic::aarch64_neon_sshl:
22751 case Intrinsic::aarch64_neon_ushl:
22752 return tryCombineShiftImm(IID, N, DAG);
22753 case Intrinsic::aarch64_neon_sabd:
22754 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22755 N->getOperand(1), N->getOperand(2));
22756 case Intrinsic::aarch64_neon_uabd:
22757 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22758 N->getOperand(1), N->getOperand(2));
22759 case Intrinsic::aarch64_neon_fcvtzs:
22760 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
22761 case Intrinsic::aarch64_neon_fcvtzu:
22762 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
22763 case Intrinsic::aarch64_neon_fcvtas:
22764 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
22765 case Intrinsic::aarch64_neon_fcvtau:
22766 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
22767 case Intrinsic::aarch64_neon_fcvtms:
22768 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
22769 case Intrinsic::aarch64_neon_fcvtmu:
22770 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
22771 case Intrinsic::aarch64_neon_fcvtns:
22772 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
22773 case Intrinsic::aarch64_neon_fcvtnu:
22774 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
22775 case Intrinsic::aarch64_neon_fcvtps:
22776 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
22777 case Intrinsic::aarch64_neon_fcvtpu:
22778 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
22779 case Intrinsic::aarch64_crc32b:
22780 case Intrinsic::aarch64_crc32cb:
22781 return tryCombineCRC32(0xff, N, DAG);
22782 case Intrinsic::aarch64_crc32h:
22783 case Intrinsic::aarch64_crc32ch:
22784 return tryCombineCRC32(0xffff, N, DAG);
22785 case Intrinsic::aarch64_sve_saddv:
22786 // There is no i64 version of SADDV because the sign is irrelevant.
22787 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
22788 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22789 else
22790 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22791 case Intrinsic::aarch64_sve_uaddv:
22792 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22793 case Intrinsic::aarch64_sve_smaxv:
22794 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22795 case Intrinsic::aarch64_sve_umaxv:
22796 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22797 case Intrinsic::aarch64_sve_sminv:
22798 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22799 case Intrinsic::aarch64_sve_uminv:
22800 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22801 case Intrinsic::aarch64_sve_orv:
22802 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22803 case Intrinsic::aarch64_sve_eorv:
22804 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22805 case Intrinsic::aarch64_sve_andv:
22806 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22807 case Intrinsic::aarch64_sve_index:
22808 return LowerSVEIntrinsicIndex(N, DAG);
22809 case Intrinsic::aarch64_sve_dup:
22810 return LowerSVEIntrinsicDUP(N, DAG);
22811 case Intrinsic::aarch64_sve_dup_x:
22812 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22813 N->getOperand(1));
22814 case Intrinsic::aarch64_sve_ext:
22815 return LowerSVEIntrinsicEXT(N, DAG);
22816 case Intrinsic::aarch64_sve_mul_u:
22817 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22818 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22819 case Intrinsic::aarch64_sve_smulh_u:
22820 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22821 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22822 case Intrinsic::aarch64_sve_umulh_u:
22823 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22824 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22825 case Intrinsic::aarch64_sve_smin_u:
22826 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22827 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22828 case Intrinsic::aarch64_sve_umin_u:
22829 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22830 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22831 case Intrinsic::aarch64_sve_smax_u:
22832 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22833 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22834 case Intrinsic::aarch64_sve_umax_u:
22835 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22836 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22837 case Intrinsic::aarch64_sve_lsl_u:
22838 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22839 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22840 case Intrinsic::aarch64_sve_lsr_u:
22841 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22842 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22843 case Intrinsic::aarch64_sve_asr_u:
22844 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22845 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22846 case Intrinsic::aarch64_sve_fadd_u:
22847 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22848 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22849 case Intrinsic::aarch64_sve_fdiv_u:
22850 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22851 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22852 case Intrinsic::aarch64_sve_fmax_u:
22853 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22854 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22855 case Intrinsic::aarch64_sve_fmaxnm_u:
22856 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22857 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22858 case Intrinsic::aarch64_sve_fmla_u:
22859 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22860 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22861 N->getOperand(2));
22862 case Intrinsic::aarch64_sve_fmin_u:
22863 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22864 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22865 case Intrinsic::aarch64_sve_fminnm_u:
22866 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22867 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22868 case Intrinsic::aarch64_sve_fmul_u:
22869 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22870 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22871 case Intrinsic::aarch64_sve_fsub_u:
22872 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22873 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22874 case Intrinsic::aarch64_sve_add_u:
22875 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22876 N->getOperand(3));
22877 case Intrinsic::aarch64_sve_sub_u:
22878 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22879 N->getOperand(3));
22880 case Intrinsic::aarch64_sve_subr:
22881 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22882 case Intrinsic::aarch64_sve_and_u:
22883 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22884 N->getOperand(3));
22885 case Intrinsic::aarch64_sve_bic_u:
22886 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22887 N->getOperand(2), N->getOperand(3));
22888 case Intrinsic::aarch64_sve_saddwb:
22889 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22890 N->getOperand(1), N->getOperand(2));
22891 case Intrinsic::aarch64_sve_saddwt:
22892 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22893 N->getOperand(1), N->getOperand(2));
22894 case Intrinsic::aarch64_sve_uaddwb:
22895 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22896 N->getOperand(1), N->getOperand(2));
22897 case Intrinsic::aarch64_sve_uaddwt:
22898 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22899 N->getOperand(1), N->getOperand(2));
22900 case Intrinsic::aarch64_sve_eor_u:
22901 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22902 N->getOperand(3));
22903 case Intrinsic::aarch64_sve_orr_u:
22904 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22905 N->getOperand(3));
22906 case Intrinsic::aarch64_sve_sabd_u:
22907 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22908 N->getOperand(2), N->getOperand(3));
22909 case Intrinsic::aarch64_sve_uabd_u:
22910 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22911 N->getOperand(2), N->getOperand(3));
22912 case Intrinsic::aarch64_sve_sdiv_u:
22913 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22914 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22915 case Intrinsic::aarch64_sve_udiv_u:
22916 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22917 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22918 case Intrinsic::aarch64_sve_sqadd:
22919 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22920 case Intrinsic::aarch64_sve_sqsub_u:
22921 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22922 N->getOperand(2), N->getOperand(3));
22923 case Intrinsic::aarch64_sve_uqadd:
22924 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22925 case Intrinsic::aarch64_sve_uqsub_u:
22926 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22927 N->getOperand(2), N->getOperand(3));
22928 case Intrinsic::aarch64_sve_sqadd_x:
22929 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22930 N->getOperand(1), N->getOperand(2));
22931 case Intrinsic::aarch64_sve_sqsub_x:
22932 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22933 N->getOperand(1), N->getOperand(2));
22934 case Intrinsic::aarch64_sve_uqadd_x:
22935 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22936 N->getOperand(1), N->getOperand(2));
22937 case Intrinsic::aarch64_sve_uqsub_x:
22938 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22939 N->getOperand(1), N->getOperand(2));
22940 case Intrinsic::aarch64_sve_asrd:
22941 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22942 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22943 case Intrinsic::aarch64_sve_cmphs:
22944 if (!N->getOperand(2).getValueType().isFloatingPoint())
22945 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22946 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22947 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22948 break;
22949 case Intrinsic::aarch64_sve_cmphi:
22950 if (!N->getOperand(2).getValueType().isFloatingPoint())
22951 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22952 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22953 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22954 break;
22955 case Intrinsic::aarch64_sve_fcmpge:
22956 case Intrinsic::aarch64_sve_cmpge:
22957 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22958 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22959 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22960 break;
22961 case Intrinsic::aarch64_sve_fcmpgt:
22962 case Intrinsic::aarch64_sve_cmpgt:
22963 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22964 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22965 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22966 break;
22967 case Intrinsic::aarch64_sve_fcmpeq:
22968 case Intrinsic::aarch64_sve_cmpeq:
22969 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22970 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22971 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22972 break;
22973 case Intrinsic::aarch64_sve_fcmpne:
22974 case Intrinsic::aarch64_sve_cmpne:
22975 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22976 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22977 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22978 break;
22979 case Intrinsic::aarch64_sve_fcmpuo:
22980 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22981 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22982 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22983 break;
22984 case Intrinsic::aarch64_sve_fadda:
22985 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22986 case Intrinsic::aarch64_sve_faddv:
22987 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22988 case Intrinsic::aarch64_sve_fmaxnmv:
22989 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22990 case Intrinsic::aarch64_sve_fmaxv:
22991 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22992 case Intrinsic::aarch64_sve_fminnmv:
22993 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22994 case Intrinsic::aarch64_sve_fminv:
22995 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22996 case Intrinsic::aarch64_sve_sel:
22997 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22998 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22999 case Intrinsic::aarch64_sve_cmpeq_wide:
23000 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
23001 case Intrinsic::aarch64_sve_cmpne_wide:
23002 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
23003 case Intrinsic::aarch64_sve_cmpge_wide:
23004 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
23005 case Intrinsic::aarch64_sve_cmpgt_wide:
23006 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
23007 case Intrinsic::aarch64_sve_cmplt_wide:
23008 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
23009 case Intrinsic::aarch64_sve_cmple_wide:
23010 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
23011 case Intrinsic::aarch64_sve_cmphs_wide:
23012 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
23013 case Intrinsic::aarch64_sve_cmphi_wide:
23014 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
23015 case Intrinsic::aarch64_sve_cmplo_wide:
23016 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
23017 case Intrinsic::aarch64_sve_cmpls_wide:
23018 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
23019 case Intrinsic::aarch64_sve_ptest_any:
23020 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23022 case Intrinsic::aarch64_sve_ptest_first:
23023 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23025 case Intrinsic::aarch64_sve_ptest_last:
23026 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23028 case Intrinsic::aarch64_sve_whilelo:
23029 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
23030 N->getOperand(1), N->getOperand(2));
23031 case Intrinsic::aarch64_sve_bsl:
23032 case Intrinsic::aarch64_sve_bsl1n:
23033 case Intrinsic::aarch64_sve_bsl2n:
23034 case Intrinsic::aarch64_sve_nbsl:
23035 return combineSVEBitSel(IID, N, DAG);
23036 }
23037 return SDValue();
23038}
23039
23040static bool isCheapToExtend(const SDValue &N) {
23041 unsigned OC = N->getOpcode();
23042 return OC == ISD::LOAD || OC == ISD::MLOAD ||
23044}
23045
23046static SDValue
23048 SelectionDAG &DAG) {
23049 // If we have (sext (setcc A B)) and A and B are cheap to extend,
23050 // we can move the sext into the arguments and have the same result. For
23051 // example, if A and B are both loads, we can make those extending loads and
23052 // avoid an extra instruction. This pattern appears often in VLS code
23053 // generation where the inputs to the setcc have a different size to the
23054 // instruction that wants to use the result of the setcc.
23055 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
23056 N->getOperand(0)->getOpcode() == ISD::SETCC);
23057 const SDValue SetCC = N->getOperand(0);
23058
23059 const SDValue CCOp0 = SetCC.getOperand(0);
23060 const SDValue CCOp1 = SetCC.getOperand(1);
23061 if (!CCOp0->getValueType(0).isInteger() ||
23062 !CCOp1->getValueType(0).isInteger())
23063 return SDValue();
23064
23065 ISD::CondCode Code =
23066 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
23067
23068 ISD::NodeType ExtType =
23069 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23070
23071 if (isCheapToExtend(SetCC.getOperand(0)) &&
23072 isCheapToExtend(SetCC.getOperand(1))) {
23073 const SDValue Ext1 =
23074 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
23075 const SDValue Ext2 =
23076 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
23077
23078 return DAG.getSetCC(
23079 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
23080 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
23081 }
23082
23083 return SDValue();
23084}
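// A minimal sketch of the combine above:
//   sext (setcc (load a), (load b), setlt)
// becomes
//   setcc (sext (load a)), (sext (load b)), setlt
// in the wider result type, where each sext can then fold into an extending
// load.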
23085
23086// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
23087// This comes from interleaved vectorization. It is performed late to capture
23088// uitofp converts too.
23090 SelectionDAG &DAG) {
23091 EVT VT = N->getValueType(0);
23092 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
23093 N->getOpcode() != ISD::ZERO_EXTEND ||
23094 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
23095 return SDValue();
23096
23097 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
23098 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23099 return SDValue();
23100
23101 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
23102 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
23103 if (!Shuffle ||
23104 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
23105 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
23106 return SDValue();
23107
23108 unsigned Idx;
23110 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
23111 // An undef interleave shuffle can come up after other canonicalizations,
23112 // where the shuffle has been converted to
23113 // zext(extract(shuffle b, undef, [u,u,0,4]))
23114 bool IsUndefDeInterleave = false;
23115 if (!IsDeInterleave)
23116 IsUndefDeInterleave =
23117 Shuffle->getOperand(1).isUndef() &&
23118 all_of(
23119 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
23120 [](int M) { return M < 0; }) &&
23122 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
23123 VT.getVectorNumElements() / 2),
23124 4, Idx);
23125 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
23126 return SDValue();
23127 SDLoc DL(N);
23128 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23129 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
23130 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23131 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
23132 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
23133 VT, BC1, BC2);
23134 if ((Idx & 1) == 1)
23135 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
23136 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
23137 return DAG.getNode(
23138 ISD::AND, DL, VT, UZP,
23139 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
23140}
23141
23142// This comes up similarly to the above when lowering deinterleaving shuffles
23143// from zexts. We have legalized the operations in the general case to
23144// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
23145// the extract is of the low half and the uzp is uzp1. There would be an extra
23146// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
23147// there could also be an existing and / shift that can be combined in, either
23148// before or after the extract.
23150 EVT VT = N->getValueType(0);
23151 if (N->getOpcode() != ISD::ZERO_EXTEND ||
23152 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
23153 return SDValue();
23154
23155 SDValue Op = N->getOperand(0);
23156 unsigned ExtOffset = (unsigned)-1;
23157 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23158 ExtOffset = Op.getConstantOperandVal(1);
23159 Op = Op.getOperand(0);
23160 }
23161
23162 unsigned Shift = 0;
23164 Op.getValueType().getScalarSizeInBits());
23165
23166 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23167 Shift = Op.getConstantOperandVal(1);
23168 Op = Op.getOperand(0);
23169 Mask = Mask.lshr(Shift);
23170 }
23171 if (Op.getOpcode() == ISD::AND &&
23172 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23173 Op = Op.getOperand(0);
23174 Mask = Mask.zext(VT.getScalarSizeInBits());
23175 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23176 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23177 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23178 Mask = Mask.zext(VT.getScalarSizeInBits());
23179 Op = Op.getOperand(0);
23180 }
23181
23182 if (ExtOffset == (unsigned)-1) {
23183 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23184 ExtOffset = Op.getConstantOperandVal(1);
23185 Op = Op.getOperand(0);
23186 } else
23187 return SDValue();
23188 }
23189 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23190 return SDValue();
23191
23192 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23193 return SDValue();
23194 if (Op.getOpcode() == AArch64ISD::UZP2)
23195 Shift += VT.getScalarSizeInBits() / 2;
23196
23197 SDLoc DL(N);
23198 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23199 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23200 if (Shift != 0)
23201 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23202 DAG.getTargetConstant(Shift, DL, MVT::i32));
23203 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23204}
23205
23208 SelectionDAG &DAG) {
23209 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23210 // we can convert that DUP into another extract_high (of a bigger DUP), which
23211 // helps the backend to decide that an sabdl2 would be useful, saving a real
23212 // extract_high operation.
23213 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23214 N->getOperand(0).getValueType().is64BitVector() &&
23215 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23216 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23217 SDNode *ABDNode = N->getOperand(0).getNode();
23218 SDValue NewABD =
23220 if (!NewABD.getNode())
23221 return SDValue();
23222
23223 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23224 }
23225
23227 return R;
23228 if (SDValue R = performZExtUZPCombine(N, DAG))
23229 return R;
23230
23231 if (N->getValueType(0).isFixedLengthVector() &&
23232 N->getOpcode() == ISD::SIGN_EXTEND &&
23233 N->getOperand(0)->getOpcode() == ISD::SETCC)
23234 return performSignExtendSetCCCombine(N, DCI, DAG);
23235
23236 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23237 // that the top half of the result register must be unused, due to the
23238 // any_extend. This means that we can replace this pattern with (rev16
23239 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23240 // ...)), which is what this pattern would otherwise be lowered to.
23241 // Only apply this optimisation if the any_extend in the original pattern
23242 // extends to i32 or i64, because this type will become the input type to
23243 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
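// Illustrative example (not from a specific test): for
//   i32 any_extend(i16 bswap(x)),
// the combine emits rev16(i32 any_extend(x)); the low halfword then holds the
// byte-swapped value and the top halfword is simply ignored, instead of
// materialising (lsr (rev ...), #16).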
23244 SDValue Bswap = N->getOperand(0);
23245 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23246 Bswap.getValueType() == MVT::i16 &&
23247 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23248 SDLoc DL(N);
23249 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23250 Bswap->getOperand(0));
23251 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23252 NewAnyExtend);
23253 }
23254
23255 return SDValue();
23256}
23257
23259 SDValue SplatVal, unsigned NumVecElts) {
23260 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23261 Align OrigAlignment = St.getAlign();
23262 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23263
23264 // Create scalar stores. This is at least as good as the code sequence for a
23265 // split unaligned store which is a dup.s, ext.b, and two stores.
23266 // Most of the time the three stores should be replaced by store pair
23267 // instructions (stp).
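// As a rough sketch of the intent for a v4i32 splat of w1 stored to [x0]
// (register names and final pairing are illustrative):
//   str w1, [x0]
//   str w1, [x0, #4]
//   str w1, [x0, #8]
//   str w1, [x0, #12]
// which the load/store optimizer is expected to merge into two stp w1, w1.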
23268 SDLoc DL(&St);
23269 SDValue BasePtr = St.getBasePtr();
23270 uint64_t BaseOffset = 0;
23271
23272 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23273 SDValue NewST1 =
23274 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23275 OrigAlignment, St.getMemOperand()->getFlags());
23276
23277 // As this is in ISel, we will not merge this add, which may degrade results.
23278 if (BasePtr->getOpcode() == ISD::ADD &&
23279 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23280 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23281 BasePtr = BasePtr->getOperand(0);
23282 }
23283
23284 unsigned Offset = EltOffset;
23285 while (--NumVecElts) {
23286 Align Alignment = commonAlignment(OrigAlignment, Offset);
23287 SDValue OffsetPtr =
23288 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23289 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23290 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23291 PtrInfo.getWithOffset(Offset), Alignment,
23292 St.getMemOperand()->getFlags());
23293 Offset += EltOffset;
23294 }
23295 return NewST1;
23296}
23297
23298// Returns an SVE type that ContentTy can be trivially sign or zero extended
23299// into.
23300static MVT getSVEContainerType(EVT ContentTy) {
23301 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23302
23303 switch (ContentTy.getSimpleVT().SimpleTy) {
23304 default:
23305 llvm_unreachable("No known SVE container for this MVT type");
23306 case MVT::nxv2i8:
23307 case MVT::nxv2i16:
23308 case MVT::nxv2i32:
23309 case MVT::nxv2i64:
23310 case MVT::nxv2f32:
23311 case MVT::nxv2f64:
23312 return MVT::nxv2i64;
23313 case MVT::nxv4i8:
23314 case MVT::nxv4i16:
23315 case MVT::nxv4i32:
23316 case MVT::nxv4f32:
23317 return MVT::nxv4i32;
23318 case MVT::nxv8i8:
23319 case MVT::nxv8i16:
23320 case MVT::nxv8f16:
23321 case MVT::nxv8bf16:
23322 return MVT::nxv8i16;
23323 case MVT::nxv16i8:
23324 return MVT::nxv16i8;
23325 }
23326}
23327
23329 SDLoc DL(N);
23330 EVT VT = N->getValueType(0);
23331
23333 return SDValue();
23334
23335 EVT ContainerVT = VT;
23336 if (ContainerVT.isInteger())
23337 ContainerVT = getSVEContainerType(ContainerVT);
23338
23339 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23340 SDValue Ops[] = { N->getOperand(0), // Chain
23341 N->getOperand(2), // Pg
23342 N->getOperand(3), // Base
23343 DAG.getValueType(VT) };
23344
23345 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23346 SDValue LoadChain = SDValue(Load.getNode(), 1);
23347
23348 if (ContainerVT.isInteger() && (VT != ContainerVT))
23349 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23350
23351 return DAG.getMergeValues({ Load, LoadChain }, DL);
23352}
23353
23355 SDLoc DL(N);
23356 EVT VT = N->getValueType(0);
23357 EVT PtrTy = N->getOperand(3).getValueType();
23358
23359 EVT LoadVT = VT;
23360 if (VT.isFloatingPoint())
23361 LoadVT = VT.changeTypeToInteger();
23362
23363 auto *MINode = cast<MemIntrinsicSDNode>(N);
23364 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23365 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23366 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23367 MINode->getOperand(2), PassThru,
23368 MINode->getMemoryVT(), MINode->getMemOperand(),
23370
23371 if (VT.isFloatingPoint()) {
23372 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23373 return DAG.getMergeValues(Ops, DL);
23374 }
23375
23376 return L;
23377}
23378
23379template <unsigned Opcode>
23381 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23382 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23383 "Unsupported opcode.");
23384 SDLoc DL(N);
23385 EVT VT = N->getValueType(0);
23386
23387 EVT LoadVT = VT;
23388 if (VT.isFloatingPoint())
23389 LoadVT = VT.changeTypeToInteger();
23390
23391 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23392 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23393 SDValue LoadChain = SDValue(Load.getNode(), 1);
23394
23395 if (VT.isFloatingPoint())
23396 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23397
23398 return DAG.getMergeValues({Load, LoadChain}, DL);
23399}
23400
23402 SDLoc DL(N);
23403 SDValue Data = N->getOperand(2);
23404 EVT DataVT = Data.getValueType();
23405 EVT HwSrcVt = getSVEContainerType(DataVT);
23406 SDValue InputVT = DAG.getValueType(DataVT);
23407
23408 if (DataVT.isFloatingPoint())
23409 InputVT = DAG.getValueType(HwSrcVt);
23410
23411 SDValue SrcNew;
23412 if (Data.getValueType().isFloatingPoint())
23413 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23414 else
23415 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23416
23417 SDValue Ops[] = { N->getOperand(0), // Chain
23418 SrcNew,
23419 N->getOperand(4), // Base
23420 N->getOperand(3), // Pg
23421 InputVT
23422 };
23423
23424 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23425}
23426
23428 SDLoc DL(N);
23429
23430 SDValue Data = N->getOperand(2);
23431 EVT DataVT = Data.getValueType();
23432 EVT PtrTy = N->getOperand(4).getValueType();
23433
23434 if (DataVT.isFloatingPoint())
23435 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23436
23437 auto *MINode = cast<MemIntrinsicSDNode>(N);
23438 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23439 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23440 MINode->getMemoryVT(), MINode->getMemOperand(),
23441 ISD::UNINDEXED, false, false);
23442}
23443
23444/// Replace a store of a splat of zeros by scalar stores of WZR/XZR. The
23445/// load/store optimizer pass will merge them into store pair instructions.
23446/// This should be better than a movi to create the vector zero followed by a
23447/// vector store if the zero constant is not re-used, since one instruction and
23448/// one register live range will be removed.
23449///
23450/// For example, the final generated code should be:
23451///
23452/// stp xzr, xzr, [x0]
23453///
23454/// instead of:
23455///
23456/// movi v0.2d, #0
23457/// str q0, [x0]
23458///
23460 SDValue StVal = St.getValue();
23461 EVT VT = StVal.getValueType();
23462
23463 // Avoid scalarizing zero splat stores for scalable vectors.
23464 if (VT.isScalableVector())
23465 return SDValue();
23466
23467 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23468 // 2, 3 or 4 i32 elements.
23469 int NumVecElts = VT.getVectorNumElements();
23470 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23471 VT.getVectorElementType().getSizeInBits() == 64) ||
23472 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23473 VT.getVectorElementType().getSizeInBits() == 32)))
23474 return SDValue();
23475
23476 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23477 return SDValue();
23478
23479 // If the zero constant has more than one use then the vector store could be
23480 // better since the constant mov will be amortized and stp q instructions
23481 // should be able to be formed.
23482 if (!StVal.hasOneUse())
23483 return SDValue();
23484
23485 // If the store is truncating then it's going down to i16 or smaller, which
23486 // means it can be implemented in a single store anyway.
23487 if (St.isTruncatingStore())
23488 return SDValue();
23489
23490 // If the immediate offset of the address operand is too large for the stp
23491 // instruction, then bail out.
23492 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23493 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23495 return SDValue();
23496 }
23497
23498 for (int I = 0; I < NumVecElts; ++I) {
23499 SDValue EltVal = StVal.getOperand(I);
23500 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23501 return SDValue();
23502 }
23503
23504 // Use a CopyFromReg WZR/XZR here to prevent
23505 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23506 SDLoc DL(&St);
23507 unsigned ZeroReg;
23508 EVT ZeroVT;
23509 if (VT.getVectorElementType().getSizeInBits() == 32) {
23510 ZeroReg = AArch64::WZR;
23511 ZeroVT = MVT::i32;
23512 } else {
23513 ZeroReg = AArch64::XZR;
23514 ZeroVT = MVT::i64;
23515 }
23516 SDValue SplatVal =
23517 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23518 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23519}
23520
23521/// Replace a store of a splat of a scalar by scalar stores of the scalar
23522/// value. The load/store optimizer pass will merge them into store pair
23523/// instructions. This has better performance than a splat of the scalar
23524/// followed by a split vector store. Even if the stores are not merged it is
23525/// four stores vs a dup, followed by an ext.b and two stores.
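// The splat being matched here is a chain of insert_vector_elt nodes, e.g.
// (illustrative):
//   t0 = insert_vector_elt undef, s, 0
//   t1 = insert_vector_elt t0, s, 1
//   store t1
// which becomes two scalar stores of s, mergeable into a single stp.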
23527 SDValue StVal = St.getValue();
23528 EVT VT = StVal.getValueType();
23529
23530 // Don't replace floating point stores; they possibly won't be transformed to
23531 // stp because of the store pair suppress pass.
23532 if (VT.isFloatingPoint())
23533 return SDValue();
23534
23535 // We can express a splat as store pair(s) for 2 or 4 elements.
23536 unsigned NumVecElts = VT.getVectorNumElements();
23537 if (NumVecElts != 4 && NumVecElts != 2)
23538 return SDValue();
23539
23540 // If the store is truncating then it's going down to i16 or smaller, which
23541 // means it can be implemented in a single store anyway.
23542 if (St.isTruncatingStore())
23543 return SDValue();
23544
23545 // Check that this is a splat.
23546 // Make sure that each of the relevant vector element locations is inserted
23547 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
23548 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23549 SDValue SplatVal;
23550 for (unsigned I = 0; I < NumVecElts; ++I) {
23551 // Check for insert vector elements.
23552 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23553 return SDValue();
23554
23555 // Check that same value is inserted at each vector element.
23556 if (I == 0)
23557 SplatVal = StVal.getOperand(1);
23558 else if (StVal.getOperand(1) != SplatVal)
23559 return SDValue();
23560
23561 // Check insert element index.
23563 if (!CIndex)
23564 return SDValue();
23565 uint64_t IndexVal = CIndex->getZExtValue();
23566 if (IndexVal >= NumVecElts)
23567 return SDValue();
23568 IndexNotInserted.reset(IndexVal);
23569
23570 StVal = StVal.getOperand(0);
23571 }
23572 // Check that all vector element locations were inserted to.
23573 if (IndexNotInserted.any())
23574 return SDValue();
23575
23576 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23577}
23578
23580 SelectionDAG &DAG,
23581 const AArch64Subtarget *Subtarget) {
23582
23584 if (S->isVolatile() || S->isIndexed())
23585 return SDValue();
23586
23587 SDValue StVal = S->getValue();
23588 EVT VT = StVal.getValueType();
23589
23590 if (!VT.isFixedLengthVector())
23591 return SDValue();
23592
23593 // If we get a splat of zeros, convert this vector store to a store of
23594 // scalars. They will be merged into store pairs of xzr thereby removing one
23595 // instruction and one register.
23596 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23597 return ReplacedZeroSplat;
23598
23599 // FIXME: The logic for deciding if an unaligned store should be split should
23600 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23601 // a call to that function here.
23602
23603 if (!Subtarget->isMisaligned128StoreSlow())
23604 return SDValue();
23605
23606 // Don't split at -Oz.
23608 return SDValue();
23609
23610 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23611 // those up regresses performance on micro-benchmarks and olden/bh.
23612 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23613 return SDValue();
23614
23615 // Split unaligned 16B stores. They are terrible for performance.
23616 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23617 // extensions can use this to mark that it does not want splitting to happen
23618 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23619 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
23620 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23621 S->getAlign() <= Align(2))
23622 return SDValue();
23623
23624 // If we get a splat of a scalar convert this vector store to a store of
23625 // scalars. They will be merged into store pairs thereby removing two
23626 // instructions.
23627 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23628 return ReplacedSplat;
23629
23630 SDLoc DL(S);
23631
23632 // Split VT into two.
23633 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23634 unsigned NumElts = HalfVT.getVectorNumElements();
23635 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23636 DAG.getConstant(0, DL, MVT::i64));
23637 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23638 DAG.getConstant(NumElts, DL, MVT::i64));
23639 SDValue BasePtr = S->getBasePtr();
23640 SDValue NewST1 =
23641 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23642 S->getAlign(), S->getMemOperand()->getFlags());
23643 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23644 DAG.getConstant(8, DL, MVT::i64));
23645 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23646 S->getPointerInfo(), S->getAlign(),
23647 S->getMemOperand()->getFlags());
23648}
23649
23651 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23652
23653 // splice(pg, op1, undef) -> op1
23654 if (N->getOperand(2).isUndef())
23655 return N->getOperand(1);
23656
23657 return SDValue();
23658}
23659
23661 const AArch64Subtarget *Subtarget) {
23662 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23663 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23664 "Unexpected Opcode!");
23665
23666 // uunpklo/hi undef -> undef
23667 if (N->getOperand(0).isUndef())
23668 return DAG.getUNDEF(N->getValueType(0));
23669
23670 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23671 // extending load. We can do this even if this is already a masked
23672 // {z,}extload.
23673 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23674 N->getOpcode() == AArch64ISD::UUNPKLO) {
23675 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23676 SDValue Mask = MLD->getMask();
23677 SDLoc DL(N);
23678
23679 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23680 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23681 (MLD->getPassThru()->isUndef() ||
23682 isZerosVector(MLD->getPassThru().getNode()))) {
23683 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23684 unsigned PgPattern = Mask->getConstantOperandVal(0);
23685 EVT VT = N->getValueType(0);
23686
23687 // Ensure we can double the size of the predicate pattern
23688 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23689 if (NumElts &&
23690 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23691 Mask =
23692 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23693 SDValue PassThru = DAG.getConstant(0, DL, VT);
23694 SDValue NewLoad = DAG.getMaskedLoad(
23695 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23696 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23698
23699 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23700
23701 return NewLoad;
23702 }
23703 }
23704 }
23705
23706 return SDValue();
23707}
23708
23710 if (N->getOpcode() != AArch64ISD::UZP1)
23711 return false;
23712 SDValue Op0 = N->getOperand(0);
23713 EVT SrcVT = Op0->getValueType(0);
23714 EVT DstVT = N->getValueType(0);
23715 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23716 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23717 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23718}
23719
23720// Try to combine rounding shifts where the operands come from an extend, and
23721// the result is truncated and combined into one vector.
23722// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23724 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23725 SDValue Op0 = N->getOperand(0);
23726 SDValue Op1 = N->getOperand(1);
23727 EVT ResVT = N->getValueType(0);
23728
23729 unsigned RshOpc = Op0.getOpcode();
23730 if (RshOpc != AArch64ISD::RSHRNB_I)
23731 return SDValue();
23732
23733 // Same op code and imm value?
23734 SDValue ShiftValue = Op0.getOperand(1);
23735 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23736 return SDValue();
23737
23738 // Same unextended operand value?
23739 SDValue Lo = Op0.getOperand(0);
23740 SDValue Hi = Op1.getOperand(0);
23741 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23742 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23743 return SDValue();
23744 SDValue OrigArg = Lo.getOperand(0);
23745 if (OrigArg != Hi.getOperand(0))
23746 return SDValue();
23747
23748 SDLoc DL(N);
23749 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23750 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23751 ShiftValue);
23752}
23753
23754// Try to simplify:
23755// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23756// t2 = nxv8i16 srl(t1, ShiftValue)
23757// to
23758// t1 = nxv8i16 rshrnb(X, shiftvalue).
23759// rshrnb will zero the top half bits of each element. Therefore, this combine
23760// should only be performed when a following instruction with the rshrnb
23761// as an operand does not care about the top half of each element. For example,
23762// a uzp1 or a truncating store.
23764 const AArch64Subtarget *Subtarget) {
23765 EVT VT = Srl->getValueType(0);
23766 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23767 return SDValue();
23768
23769 EVT ResVT;
23770 if (VT == MVT::nxv8i16)
23771 ResVT = MVT::nxv16i8;
23772 else if (VT == MVT::nxv4i32)
23773 ResVT = MVT::nxv8i16;
23774 else if (VT == MVT::nxv2i64)
23775 ResVT = MVT::nxv4i32;
23776 else
23777 return SDValue();
23778
23779 SDLoc DL(Srl);
23780 unsigned ShiftValue;
23781 SDValue RShOperand;
23782 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23783 return SDValue();
23784 SDValue Rshrnb = DAG.getNode(
23785 AArch64ISD::RSHRNB_I, DL, ResVT,
23786 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23787 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23788}
23789
23791 if (V.getOpcode() != AArch64ISD::NVCAST)
23792 return SDValue();
23793
23794 SDValue Op = V.getOperand(0);
23795 if (!Op.getValueType().isVector() ||
23796 V.getValueType().getVectorElementCount() !=
23797 Op.getValueType().getVectorElementCount() * 2)
23798 return SDValue();
23799
23800 return Op;
23801}
23802
23804 const AArch64Subtarget *Subtarget) {
23805 SDLoc DL(N);
23806 SDValue Op0 = N->getOperand(0);
23807 SDValue Op1 = N->getOperand(1);
23808 EVT ResVT = N->getValueType(0);
23809
23810 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23811 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23813 Op0.getOperand(0) == Op1.getOperand(0)) {
23814
23815 SDValue SourceVec = Op0.getOperand(0);
23816 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23817 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23818 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23819 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23820 EVT OpVT = Op0.getOperand(1).getValueType();
23821 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23822 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23823 DAG.getUNDEF(WidenedResVT));
23824 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23825 DAG.getConstant(0, DL, OpVT));
23826 }
23827 }
23828
23829 // Following optimizations only work with uzp1.
23830 if (N->getOpcode() == AArch64ISD::UZP2)
23831 return SDValue();
23832
23833 // uzp1(x, undef) -> concat(truncate(x), undef)
23834 if (Op1.getOpcode() == ISD::UNDEF) {
23835 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23836 switch (ResVT.getSimpleVT().SimpleTy) {
23837 default:
23838 break;
23839 case MVT::v16i8:
23840 BCVT = MVT::v8i16;
23841 HalfVT = MVT::v8i8;
23842 break;
23843 case MVT::v8i16:
23844 BCVT = MVT::v4i32;
23845 HalfVT = MVT::v4i16;
23846 break;
23847 case MVT::v4i32:
23848 BCVT = MVT::v2i64;
23849 HalfVT = MVT::v2i32;
23850 break;
23851 }
23852 if (BCVT != MVT::Other) {
23853 SDValue BC = DAG.getBitcast(BCVT, Op0);
23854 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23855 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23856 DAG.getUNDEF(HalfVT));
23857 }
23858 }
23859
23860 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23861 return Urshr;
23862
23863 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23864 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23865 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23866 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23867 }
23868 }
23869
23870 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23871 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23872 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23873 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23874 }
23875 }
23876
23877 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23878 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23879 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23880 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23881 SDValue X = PreCast.getOperand(0).getOperand(0);
23882 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23883 }
23884 }
23885 }
23886
23887 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23888 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23889 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23890 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23891 SDValue Z = PreCast.getOperand(0).getOperand(1);
23892 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23893 }
23894 }
23895 }
23896
23897 // These optimizations only work on little endian.
23898 if (!DAG.getDataLayout().isLittleEndian())
23899 return SDValue();
23900
23901 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23902 // Example:
23903 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23904 // to
23905 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23907 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23908 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23909 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23910 Op1.getOperand(0));
23911 }
23912 }
23913
23914 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23915 return SDValue();
23916
23917 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23918 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23919
23920 // truncating uzp1(x, y) -> xtn(concat (x, y))
23921 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23922 EVT Op0Ty = SourceOp0.getValueType();
23923 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23924 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23925 SDValue Concat =
23928 SourceOp0, SourceOp1);
23929 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23930 }
23931 }
23932
23933 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23934 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23935 SourceOp1.getOpcode() != ISD::TRUNCATE)
23936 return SDValue();
23937 SourceOp0 = SourceOp0.getOperand(0);
23938 SourceOp1 = SourceOp1.getOperand(0);
23939
23940 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23941 !SourceOp0.getValueType().isSimple())
23942 return SDValue();
23943
23944 EVT ResultTy;
23945
23946 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23947 case MVT::v2i64:
23948 ResultTy = MVT::v4i32;
23949 break;
23950 case MVT::v4i32:
23951 ResultTy = MVT::v8i16;
23952 break;
23953 case MVT::v8i16:
23954 ResultTy = MVT::v16i8;
23955 break;
23956 default:
23957 return SDValue();
23958 }
23959
23960 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23961 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23962 SDValue UzpResult =
23963 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23964
23965 EVT BitcastResultTy;
23966
23967 switch (ResVT.getSimpleVT().SimpleTy) {
23968 case MVT::v2i32:
23969 BitcastResultTy = MVT::v2i64;
23970 break;
23971 case MVT::v4i16:
23972 BitcastResultTy = MVT::v4i32;
23973 break;
23974 case MVT::v8i8:
23975 BitcastResultTy = MVT::v8i16;
23976 break;
23977 default:
23978 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23979 }
23980
23981 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23982 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23983}
23984
23986 unsigned Opc = N->getOpcode();
23987
23988 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23989 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23990 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23991 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23992 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23993 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23994 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23995 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23996
23997 SDLoc DL(N);
23998 SDValue Chain = N->getOperand(0);
23999 SDValue Pg = N->getOperand(1);
24000 SDValue Base = N->getOperand(2);
24001 SDValue Offset = N->getOperand(3);
24002 SDValue Ty = N->getOperand(4);
24003
24004 EVT ResVT = N->getValueType(0);
24005
24006 const auto OffsetOpc = Offset.getOpcode();
24007 const bool OffsetIsZExt =
24008 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
24009 const bool OffsetIsSExt =
24010 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
24011
24012 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
24013 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
24014 SDValue ExtPg = Offset.getOperand(0);
24015 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
24016 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
24017
24018 // If the predicate for the sign- or zero-extended offset is the
24019 // same as the predicate used for this load and the sign-/zero-extension
24020 // was from a 32-bits...
24021 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
24022 SDValue UnextendedOffset = Offset.getOperand(1);
24023
24024 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
24025 if (Signed)
24026 NewOpc = getSignExtendedGatherOpcode(NewOpc);
24027
24028 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
24029 {Chain, Pg, Base, UnextendedOffset, Ty});
24030 }
24031 }
24032
24033 return SDValue();
24034}
24035
24036/// Optimize a vector shift instruction and its operand if shifted out
24037/// bits are not used.
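// Two illustrative cases (lane values are hypothetical):
//  * vashr(vshl(x, 8), 8) on v8i16 is a sign_extend_inreg from i8 per lane;
//    if x already has more than 8 sign bits per lane it is returned as-is.
//  * For vlshr(y, 8) the low 8 bits of each lane of y are shifted out, so
//    only the upper bits are demanded when simplifying y.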
24039 const AArch64TargetLowering &TLI,
24041 assert(N->getOpcode() == AArch64ISD::VASHR ||
24042 N->getOpcode() == AArch64ISD::VLSHR);
24043
24044 SDValue Op = N->getOperand(0);
24045 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
24046
24047 unsigned ShiftImm = N->getConstantOperandVal(1);
24048 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
24049
24050 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
24051 if (N->getOpcode() == AArch64ISD::VASHR &&
24052 Op.getOpcode() == AArch64ISD::VSHL &&
24053 N->getOperand(1) == Op.getOperand(1))
24054 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
24055 return Op.getOperand(0);
24056
24057 // If the shift is exact, the shifted out bits matter.
24058 if (N->getFlags().hasExact())
24059 return SDValue();
24060
24061 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
24062 APInt DemandedMask = ~ShiftedOutBits;
24063
24064 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
24065 return SDValue(N, 0);
24066
24067 return SDValue();
24068}
24069
24071 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
24072 // This transform works in partnership with performSetCCPunpkCombine to
24073 // remove unnecessary transfer of predicates into standard registers and back
24074 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
24075 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
24076 MVT::i1) {
24077 SDValue CC = N->getOperand(0)->getOperand(0);
24078 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
24079 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
24080 DAG.getVectorIdxConstant(0, SDLoc(N)));
24081 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
24082 }
24083
24084 return SDValue();
24085}
24086
24087/// Target-specific DAG combine function for post-increment LD1 (lane) and
24088/// post-increment LD1R.
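// Illustrative target of this combine (register names are arbitrary):
//   %v = insert_vector_elt %vec, (load %p), lane    ;  %p2 = add %p, #4
// becomes a single post-incremented  ld1 { v0.s }[lane], [x0], #4, and the
// DUP-of-load form similarly becomes  ld1r { v0.4s }, [x0], #4.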
24091 bool IsLaneOp) {
24092 if (DCI.isBeforeLegalizeOps())
24093 return SDValue();
24094
24095 SelectionDAG &DAG = DCI.DAG;
24096 EVT VT = N->getValueType(0);
24097
24098 if (!VT.is128BitVector() && !VT.is64BitVector())
24099 return SDValue();
24100
24101 // If it is not a LOAD, we cannot do this combine.
24102 unsigned LoadIdx = IsLaneOp ? 1 : 0;
24103 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
24104 if (!LD)
24105 return SDValue();
24106
24107 // If the Generic combiner already helped form a pre- or post-indexed load,
24108 // skip forming one here.
24109 if (LD->isIndexed())
24110 return SDValue();
24111
24112 // The vector lane must be a constant in the LD1LANE opcode.
24113 SDValue Lane;
24114 if (IsLaneOp) {
24115 Lane = N->getOperand(2);
24116 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
24117 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
24118 return SDValue();
24119 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
24120 return SDValue();
24121 }
24122
24123 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
24124 EVT MemVT = LoadSDN->getMemoryVT();
24125 // Check if memory operand is the same type as the vector element.
24126 if (MemVT != VT.getVectorElementType())
24127 return SDValue();
24128
24129 // Check if there are other uses. If so, do not combine as it will introduce
24130 // an extra load.
24131 for (SDUse &U : LD->uses()) {
24132 if (U.getResNo() == 1) // Ignore uses of the chain result.
24133 continue;
24134 if (U.getUser() != N)
24135 return SDValue();
24136 }
24137
24138 // If there is one use and it can splat the value, prefer that operation.
24139 // TODO: This could be expanded to more operations if they reliably use the
24140 // index variants.
24141 if (N->hasOneUse()) {
24142 unsigned UseOpc = N->user_begin()->getOpcode();
24143 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
24144 return SDValue();
24145 }
24146
24147 SDValue Addr = LD->getOperand(1);
24148 SDValue Vector = N->getOperand(0);
24149 // Search for a use of the address operand that is an increment.
24150 for (SDUse &Use : Addr->uses()) {
24151 SDNode *User = Use.getUser();
24152 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24153 continue;
24154
24155 // If the increment is a constant, it must match the memory ref size.
24156 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24157 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24158 uint32_t IncVal = CInc->getZExtValue();
24159 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24160 if (IncVal != NumBytes)
24161 continue;
24162 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24163 }
24164
24165 // To avoid creating a cycle, make sure that neither the load nor the add
24166 // is a predecessor of the other or of the Vector.
24169 Visited.insert(Addr.getNode());
24170 Worklist.push_back(User);
24171 Worklist.push_back(LD);
24172 Worklist.push_back(Vector.getNode());
24173 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24174 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24175 continue;
24176
24178 Ops.push_back(LD->getOperand(0)); // Chain
24179 if (IsLaneOp) {
24180 Ops.push_back(Vector); // The vector to be inserted
24181 Ops.push_back(Lane); // The lane to be inserted in the vector
24182 }
24183 Ops.push_back(Addr);
24184 Ops.push_back(Inc);
24185
24186 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24187 SDVTList SDTys = DAG.getVTList(Tys);
24188 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24189 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24190 MemVT,
24191 LoadSDN->getMemOperand());
24192
24193 // Update the uses.
24194 SDValue NewResults[] = {
24195 SDValue(LD, 0), // The result of load
24196 SDValue(UpdN.getNode(), 2) // Chain
24197 };
24198 DCI.CombineTo(LD, NewResults);
24199 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24200 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24201
24202 break;
24203 }
24204 return SDValue();
24205}
24206
24207/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24208/// address translation.
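// Conceptual example: with TBI, an address computed as
//   and x1, x0, #0x00ffffffffffffff
// feeding a load or store can drop the mask, since bits [63:56] are ignored
// by the hardware during translation; SimplifyDemandedBits with a 56-bit
// demanded mask performs that removal on the DAG.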
24209static bool performTBISimplification(SDValue Addr,
24211 SelectionDAG &DAG) {
24212 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
24213 KnownBits Known;
24215 !DCI.isBeforeLegalizeOps());
24216 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24217 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24218 DCI.CommitTargetLoweringOpt(TLO);
24219 return true;
24220 }
24221 return false;
24222}
24223
24224static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24225 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24226 "Expected STORE dag node in input!");
24227
24228 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24229 if (!Store->isTruncatingStore() || Store->isIndexed())
24230 return SDValue();
24231 SDValue Ext = Store->getValue();
24232 auto ExtOpCode = Ext.getOpcode();
24233 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24234 ExtOpCode != ISD::ANY_EXTEND)
24235 return SDValue();
24236 SDValue Orig = Ext->getOperand(0);
24237 if (Store->getMemoryVT() != Orig.getValueType())
24238 return SDValue();
24239 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24240 Store->getBasePtr(), Store->getMemOperand());
24241 }
24242
24243 return SDValue();
24244}
24245
24246// A custom combine to lower load <3 x i8> as the more efficient sequence
24247// below:
24248// ldrb wX, [x0, #2]
24249// ldrh wY, [x0]
24250// orr wX, wY, wX, lsl #16
24251// fmov s0, wX
24252//
24253// Note that an alternative sequence with even fewer (although usually more
24254// complex/expensive) instructions would be:
24255// ld1r.4h { v0 }, [x0], #2
24256// ld1.b { v0 }[2], [x0]
24257//
24258// Generating this sequence unfortunately results in noticeably worse codegen
24259// for code that extends the loaded v3i8, due to legalization breaking vector
24260// shuffle detection in a way that is very difficult to work around.
24261// TODO: Revisit once v3i8 legalization has been improved in general.
24262static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24263 EVT MemVT = LD->getMemoryVT();
24264 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24265 LD->getBaseAlign() >= 4)
24266 return SDValue();
24267
24268 SDLoc DL(LD);
24270 SDValue Chain = LD->getChain();
24271 SDValue BasePtr = LD->getBasePtr();
24272 MachineMemOperand *MMO = LD->getMemOperand();
24273 assert(LD->getOffset().isUndef() && "undef offset expected");
24274
24275 // Load 2 x i8, then 1 x i8.
24276 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24277 TypeSize Offset2 = TypeSize::getFixed(2);
24278 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24279 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24280 MF.getMachineMemOperand(MMO, 2, 1));
24281
24282 // Extend to i32.
24283 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24284 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24285
24286 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24287 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24288 DAG.getConstant(16, DL, MVT::i32));
24289 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24290 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24291
24292 // Extract v3i8 again.
24293 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24294 DAG.getConstant(0, DL, MVT::i64));
24296 ISD::TokenFactor, DL, MVT::Other,
24297 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24298 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24299}
24300
24301// Perform TBI simplification if supported by the target, and try to break up
24302// nontemporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
24303// load instructions can be selected.
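// Sketch of the splitting below for one illustrative case: a 384-bit
// non-temporal v24i16 load becomes a 256-bit v16i16 load at offset 0 plus a
// v8i16 load at offset 32; the pieces are concatenated and trimmed back to
// the original type, and the 256-bit piece can then select to an LDNP of
// q-registers.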
24304static SDValue performLOADCombine(SDNode *N,
24306 SelectionDAG &DAG,
24307 const AArch64Subtarget *Subtarget) {
24308 if (Subtarget->supportsAddressTopByteIgnored())
24309 performTBISimplification(N->getOperand(1), DCI, DAG);
24310
24312 EVT RegVT = LD->getValueType(0);
24313 EVT MemVT = LD->getMemoryVT();
24314 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24315 SDLoc DL(LD);
24316
24317 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24318 unsigned AddrSpace = LD->getAddressSpace();
24319 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24320 AddrSpace == ARM64AS::PTR32_UPTR) {
24321 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24322 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24323 SDValue Cast =
24324 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24325 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24326 Cast, LD->getPointerInfo(), MemVT,
24327 LD->getBaseAlign(),
24328 LD->getMemOperand()->getFlags());
24329 }
24330 }
24331
24332 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24333 return SDValue(N, 0);
24334
24335 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24336 return Res;
24337
24338 if (!LD->isNonTemporal())
24339 return SDValue(N, 0);
24340
24341 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24342 MemVT.getSizeInBits() % 256 == 0 ||
24343 256 % MemVT.getScalarSizeInBits() != 0)
24344 return SDValue(N, 0);
24345
24346 SDValue Chain = LD->getChain();
24347 SDValue BasePtr = LD->getBasePtr();
24348 SDNodeFlags Flags = LD->getFlags();
24350 SmallVector<SDValue, 4> LoadOpsChain;
24351 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24352 // and a scalar/vector load smaller than 256 bits. This way we can utilize
24353 // 256-bit loads and reduce the number of load instructions generated.
24354 MVT NewVT =
24356 256 / MemVT.getVectorElementType().getSizeInBits());
24357 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24358 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
24359 for (unsigned I = 0; I < Num256Loads; I++) {
24360 unsigned PtrOffset = I * 32;
24361 SDValue NewPtr = DAG.getMemBasePlusOffset(
24362 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24363 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24364 SDValue NewLoad = DAG.getLoad(
24365 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24366 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24367 LoadOps.push_back(NewLoad);
24368 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24369 }
24370
24371 // Process remaining bits of the load operation.
24372 // This is done by creating an UNDEF vector to match the size of the
24373 // 256-bit loads and inserting the remaining load into it. We extract the
24374 // original load type at the end using an EXTRACT_SUBVECTOR node.
24375 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24376 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24377 MVT RemainingVT = MVT::getVectorVT(
24379 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24380 SDValue NewPtr = DAG.getMemBasePlusOffset(
24381 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24382 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24383 SDValue RemainingLoad =
24384 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24385 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24386 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24387 SDValue UndefVector = DAG.getUNDEF(NewVT);
24388 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24389 SDValue ExtendedRemainingLoad =
24390 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24391 {UndefVector, RemainingLoad, InsertIdx});
24392 LoadOps.push_back(ExtendedRemainingLoad);
24393 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24394 EVT ConcatVT =
24396 LoadOps.size() * NewVT.getVectorNumElements());
24397 SDValue ConcatVectors =
24398 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24399 // Extract the original vector type size.
24400 SDValue ExtractSubVector =
24401 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24402 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24404 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24405 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24406}
24407
24408static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24409 EVT VecVT = Op.getValueType();
24410 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24411 "Need boolean vector type.");
24412
24413 if (Depth > 3)
24415
24416 // We can get the base type from a vector compare or truncate.
24417 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24418 return Op.getOperand(0).getValueType();
24419
24420 // If an operand is a bool vector, continue looking.
24422 for (SDValue Operand : Op->op_values()) {
24423 if (Operand.getValueType() != VecVT)
24424 continue;
24425
24426 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24427 if (!BaseVT.isSimple())
24428 BaseVT = OperandVT;
24429 else if (OperandVT != BaseVT)
24431 }
24432
24433 return BaseVT;
24434}
24435
24436// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24437// iN, we can use a trick that extracts the i^th bit from the i^th element and
24438// then performs a vector add to get a scalar bitmask. This requires that each
24439// element's bits are either all 1 or all 0.
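// A small sketch for v4i32 on little-endian (values illustrative): each lane
// is sign-extended so it is all-ones or all-zero, ANDed with the constant
// mask <1, 2, 4, 8>, and a VECREDUCE_ADD (addv) of the result yields the
// 4-bit scalar bitmask.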
24440static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24441 SDLoc DL(N);
24442 SDValue ComparisonResult(N, 0);
24443 EVT VecVT = ComparisonResult.getValueType();
24444 assert(VecVT.isVector() && "Must be a vector type");
24445
24446 unsigned NumElts = VecVT.getVectorNumElements();
24447 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24448 return SDValue();
24449
24450 if (VecVT.getVectorElementType() != MVT::i1 &&
24451 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24452 return SDValue();
24453
24454 // If we can find the original types to work on instead of a vector of i1,
24455 // we can avoid extend/extract conversion instructions.
24456 if (VecVT.getVectorElementType() == MVT::i1) {
24457 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24458 if (!VecVT.isSimple()) {
24459 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24460 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24461 }
24462 }
24463 VecVT = VecVT.changeVectorElementTypeToInteger();
24464
24465 // Large vectors don't map directly to this conversion, so to avoid too many
24466 // edge cases, we don't apply it here. The conversion will likely still be
24467 // applied later via multiple smaller vectors, whose results are concatenated.
24468 if (VecVT.getSizeInBits() > 128)
24469 return SDValue();
24470
24471 // Ensure that all elements' bits are either 0s or 1s.
24472 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24473
24474 bool IsLE = DAG.getDataLayout().isLittleEndian();
24475 SmallVector<SDValue, 16> MaskConstants;
24477 VecVT == MVT::v16i8) {
24478 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24479 // per entry. We split it into two halves, apply the mask, zip the halves to
24480 // create 8x 16-bit values, and then perform the vector reduce.
24481 for (unsigned Half = 0; Half < 2; ++Half) {
24482 for (unsigned I = 0; I < 8; ++I) {
24483 // On big-endian targets, the lane order in sub-byte vector elements
24484 // gets reversed, so we need to flip the bit index.
24485 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
24486 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24487 }
24488 }
24489 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24490 SDValue RepresentativeBits =
24491 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24492
24493 SDValue UpperRepresentativeBits =
24494 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24495 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24496 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24497 RepresentativeBits, UpperRepresentativeBits);
24498 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24499 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24500 }
24501
24502 // All other vector sizes.
24503 unsigned NumEl = VecVT.getVectorNumElements();
24504 for (unsigned I = 0; I < NumEl; ++I) {
24505 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
24506 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24507 }
24508
24509 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24510 SDValue RepresentativeBits =
24511 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24512 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24513 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24514 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24515}
24516
24517static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24518 StoreSDNode *Store) {
24519 if (!Store->isTruncatingStore())
24520 return SDValue();
24521
24522 SDLoc DL(Store);
24523 SDValue VecOp = Store->getValue();
24524 EVT VT = VecOp.getValueType();
24525 EVT MemVT = Store->getMemoryVT();
24526
24527 if (!MemVT.isVector() || !VT.isVector() ||
24528 MemVT.getVectorElementType() != MVT::i1)
24529 return SDValue();
24530
24531 // If we are storing a vector that we are currently building, let
24532 // `scalarizeVectorStore()` handle this more efficiently.
24533 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24534 return SDValue();
24535
24536 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24537 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24538 if (!VectorBits)
24539 return SDValue();
24540
24541 EVT StoreVT =
24543 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24544 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24545 Store->getMemOperand());
24546}
24547
24548// Combine store (fp_to_int X) to use vector semantics around the conversion
24549// when NEON is available. This allows us to store the in-vector result directly
24550// without transferring the result into a GPR in the process.
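// Hypothetical example of the intended codegen difference for
// store (i32 fp_to_sint f32 %x):
//   before:  fcvtzs w8, s0 ; str w8, [x0]
//   after:   fcvtzs s0, s0 ; str s0, [x0]
// keeping the converted value in the FP/SIMD register file.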
24551static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24553 SelectionDAG &DAG,
24554 const AArch64Subtarget *Subtarget) {
24555 // Limit to post-legalization in order to avoid peeling truncating stores.
24556 if (DCI.isBeforeLegalize())
24557 return SDValue();
24558 if (!Subtarget->isNeonAvailable())
24559 return SDValue();
24560 // Source operand is already a vector.
24561 SDValue Value = ST->getValue();
24562 if (Value.getValueType().isVector())
24563 return SDValue();
24564
24565 // Look through potential assertions.
24566 while (Value->isAssert())
24567 Value = Value.getOperand(0);
24568
24569 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24570 Value.getOpcode() != ISD::FP_TO_UINT)
24571 return SDValue();
24572 if (!Value->hasOneUse())
24573 return SDValue();
24574
24575 SDValue FPSrc = Value.getOperand(0);
24576 EVT SrcVT = FPSrc.getValueType();
24577 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24578 return SDValue();
24579
24580 // No support for assignments such as i64 = fp_to_sint i32
24581 EVT VT = Value.getSimpleValueType();
24582 if (VT != SrcVT.changeTypeToInteger())
24583 return SDValue();
24584
24585 // Create a 128-bit element vector to avoid widening. The floating point
24586 // conversion is transformed into a single element conversion via a pattern.
24587 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24588 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24589 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24590 SDLoc DL(ST);
24591 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24592 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24593
24595 SDValue Extracted =
24596 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24597
24598 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24599 return SDValue(ST, 0);
24600}
24601
24602bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24603 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24604 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24605 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24606}
24607
24608// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
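// Rough shape of the result (conceptual, before instruction selection): the
// truncated value is widened to 4 lanes, bitcast to v8i8/v16i8, and the three
// byte lanes are stored individually at offsets #2, #1 and #0, letting later
// selection pick byte-store forms such as strb or st1.b.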
24609static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24610 const AArch64Subtarget *Subtarget) {
24611 SDValue Value = ST->getValue();
24612 EVT ValueVT = Value.getValueType();
24613
24614 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24615 Value.getOpcode() != ISD::TRUNCATE ||
24616 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24617 return SDValue();
24618
24619 assert(ST->getOffset().isUndef() && "undef offset expected");
24620 SDLoc DL(ST);
24621 auto WideVT = EVT::getVectorVT(
24622 *DAG.getContext(),
24623 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24624 SDValue UndefVector = DAG.getUNDEF(WideVT);
24625 SDValue WideTrunc = DAG.getNode(
24626 ISD::INSERT_SUBVECTOR, DL, WideVT,
24627 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24628 SDValue Cast = DAG.getNode(
24629 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24630 WideTrunc);
24631
24633 SDValue Chain = ST->getChain();
24634 MachineMemOperand *MMO = ST->getMemOperand();
24635 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24636 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24637 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24638 TypeSize Offset2 = TypeSize::getFixed(2);
24639 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24640 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24641
24642 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24643 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24644 TypeSize Offset1 = TypeSize::getFixed(1);
24645 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24646 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24647
24648 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24649 DAG.getConstant(0, DL, MVT::i64));
24650 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24651 MF.getMachineMemOperand(MMO, 0, 1));
24652 return Chain;
24653}
24654
24655static unsigned getFPSubregForVT(EVT VT) {
24656 assert(VT.isSimple() && "Expected simple VT");
24657 switch (VT.getSimpleVT().SimpleTy) {
24658 case MVT::aarch64mfp8:
24659 return AArch64::bsub;
24660 case MVT::f16:
24661 return AArch64::hsub;
24662 case MVT::f32:
24663 return AArch64::ssub;
24664 case MVT::f64:
24665 return AArch64::dsub;
24666 default:
24667 llvm_unreachable("Unexpected VT!");
24668 }
24669}
24670
24671static SDValue performSTORECombine(SDNode *N,
24673 SelectionDAG &DAG,
24674 const AArch64Subtarget *Subtarget) {
24676 SDValue Chain = ST->getChain();
24677 SDValue Value = ST->getValue();
24678 SDValue Ptr = ST->getBasePtr();
24679 EVT ValueVT = Value.getValueType();
24680 EVT MemVT = ST->getMemoryVT();
24681 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24682 SDLoc DL(ST);
24683
24684 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24685 return Res;
24686
24687 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24688 EVT EltVT = VT.getVectorElementType();
24689 return EltVT == MVT::f32 || EltVT == MVT::f64;
24690 };
24691
24692 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24693 unsigned AddrSpace = ST->getAddressSpace();
24694 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24695 AddrSpace == ARM64AS::PTR32_UPTR) {
24696 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24697 if (PtrVT != Ptr.getSimpleValueType()) {
24698 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24699 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24700 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24701 ST->getAAInfo());
24702 }
24703 }
24704
24705 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24706 return Res;
24707
24708 // If this is an FP_ROUND followed by a store, fold this into a truncating
24709 // store. We can do this even if this is already a truncstore.
24710 // We purposefully don't care about legality of the nodes here as we know
24711 // they can be split down into something legal.
24712 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24713 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24714 Subtarget->useSVEForFixedLengthVectors() &&
24715 ValueVT.isFixedLengthVector() &&
24716 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24717 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24718 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24719 ST->getMemOperand());
24720
24721 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24722 return Split;
24723
24724 if (Subtarget->supportsAddressTopByteIgnored() &&
24725 performTBISimplification(N->getOperand(2), DCI, DAG))
24726 return SDValue(N, 0);
24727
24728 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24729 return Store;
24730
24731 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24732 return Store;
24733
24734 if (ST->isTruncatingStore() &&
24735 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24736 if (SDValue Rshrnb =
24737 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24738 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24739 MemVT, ST->getMemOperand());
24740 }
24741 }
24742
24743 // This is an integer vector_extract_elt followed by a (possibly truncating)
24744 // store. We may be able to replace this with a store of an FP subregister.
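// For example (illustrative only): storing lane 1 of a v4i32 through a GPR
//   umov w8, v0.s[1]
//   str  w8, [x0]
// can instead keep the value in the FP/SIMD register file:
//   mov  s1, v0.s[1]
//   str  s1, [x0]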
24745 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24746 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24747
24748 SDValue Vector = Value.getOperand(0);
24749 SDValue ExtIdx = Value.getOperand(1);
24750 EVT VectorVT = Vector.getValueType();
24751 EVT ElemVT = VectorVT.getVectorElementType();
24752
24753 if (!ValueVT.isInteger())
24754 return SDValue();
24755
24756 // Propagate zero constants (applying this fold may miss optimizations).
24757 if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
24758 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24759 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24760 return SDValue();
24761 }
24762
24763 if (ValueVT != MemVT && !ST->isTruncatingStore())
24764 return SDValue();
24765
24766 // This could generate an additional extract if the index is non-zero and
24767 // the extracted value has multiple uses.
24768 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24769 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24770 return SDValue();
24771
24772 // These can lower to st1, which is preferable if we're unlikely to fold the
24773 // addressing into the store.
24774 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24775 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24776 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24777 return SDValue();
24778
24779 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24780 // Heuristic: If there are other users of w/x integer scalars extracted
24781 // from this vector that won't fold into the store -- abandon folding.
24782 // Applying this fold may disrupt paired stores.
24783 for (const auto &Use : Vector->uses()) {
24784 if (Use.getResNo() != Vector.getResNo())
24785 continue;
24786 const SDNode *User = Use.getUser();
24787 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24788 (!User->hasOneUse() ||
24789 (*User->user_begin())->getOpcode() != ISD::STORE))
24790 return SDValue();
24791 }
24792 }
24793
24794 SDValue ExtVector = Vector;
24795 if (!ExtCst || !ExtCst->isZero()) {
24796 // Handle extracting from lanes != 0.
24797 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24798 Value.getValueType(), Vector, ExtIdx);
24799 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24800 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24801 DAG.getUNDEF(VectorVT), Ext, Zero);
24802 }
24803
24804 EVT FPMemVT = MemVT == MVT::i8
24805 ? MVT::aarch64mfp8
24806 : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
24807 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24808 FPMemVT, ExtVector);
24809
24810 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24811 ST->getMemOperand());
24812 }
24813
24814 return SDValue();
24815}
24816
24817static bool
24818isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
24819 if (N->getOpcode() != ISD::CONCAT_VECTORS)
24820 return false;
24821
24822 unsigned NumParts = N->getNumOperands();
24823
24824 // We should be concatenating each sequential result from a
24825 // VECTOR_INTERLEAVE.
24826 SDNode *InterleaveOp = N->getOperand(0).getNode();
24827 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
24828 InterleaveOp->getNumOperands() != NumParts)
24829 return false;
24830
24831 for (unsigned I = 0; I < NumParts; I++)
24832 if (N->getOperand(I) != SDValue(InterleaveOp, I))
24833 return false;
24834
24835 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
24836 return true;
24837}
24838
24839static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
24840 SDValue WideMask,
24841 unsigned RequiredNumParts) {
24842 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
24843 SmallVector<SDValue, 4> MaskInterleaveOps;
24844 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
24845 MaskInterleaveOps))
24846 return SDValue();
24847
24848 if (MaskInterleaveOps.size() != RequiredNumParts)
24849 return SDValue();
24850
24851 // Make sure the inputs to the vector interleave are identical.
24852 if (!llvm::all_equal(MaskInterleaveOps))
24853 return SDValue();
24854
24855 return MaskInterleaveOps[0];
24856 }
24857
24858 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
24859 return SDValue();
24860
24861 ElementCount EC = WideMask.getValueType().getVectorElementCount();
24862 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
24863 "Expected element count divisible by number of parts");
24864 EC = EC.divideCoefficientBy(RequiredNumParts);
24865 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
24866 WideMask->getOperand(0));
24867}
24868
24869static SDValue performInterleavedMaskedStoreCombine(
24870 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24871 if (!DCI.isBeforeLegalize())
24872 return SDValue();
24873
24874 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24875 SDValue WideValue = MST->getValue();
24876
24877 // Bail out if the stored value has an unexpected number of uses, since we'll
24878 // have to perform manual interleaving and may as well just use normal masked
24879 // stores. Also, discard masked stores that are truncating or indexed.
24880 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
24881 !MST->isSimple() || !MST->getOffset().isUndef())
24882 return SDValue();
24883
24884 SmallVector<SDValue, 4> ValueInterleaveOps;
24885 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
24886 ValueInterleaveOps))
24887 return SDValue();
24888
24889 unsigned NumParts = ValueInterleaveOps.size();
24890 if (NumParts != 2 && NumParts != 4)
24891 return SDValue();
24892
24893 // At the moment we're unlikely to see a fixed-width vector interleave as
24894 // we usually generate shuffles instead.
24895 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
24896 if (!SubVecTy.isScalableVT() ||
24897 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
24898 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
24899 return SDValue();
24900
24901 SDLoc DL(N);
24902 SDValue NarrowMask =
24903 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
24904 if (!NarrowMask)
24905 return SDValue();
24906
24907 const Intrinsic::ID IID =
24908 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
24909 SmallVector<SDValue, 8> NewStOps;
24910 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
24911 NewStOps.append(ValueInterleaveOps);
24912 NewStOps.append({NarrowMask, MST->getBasePtr()});
24913 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
24914}
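// Illustrative shape of the combine above (not verbatim from a test):
//   masked.store(concat(vector_interleave(a, b)), ptr,
//                concat(vector_interleave(m, m)))
// becomes the SVE structured store
//   llvm.aarch64.sve.st2(a, b, m, ptr)
// with the narrowed mask m recovered by getNarrowMaskForInterleavedOps.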
24915
24916static SDValue performMSTORECombine(SDNode *N,
24917 TargetLowering::DAGCombinerInfo &DCI,
24918 SelectionDAG &DAG,
24919 const AArch64Subtarget *Subtarget) {
24920 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24921 SDValue Value = MST->getValue();
24922 SDValue Mask = MST->getMask();
24923 SDLoc DL(N);
24924
24925 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
24926 return Res;
24927
24928 // If this is a UZP1 followed by a masked store, fold this into a masked
24929 // truncating store. We can do this even if this is already a masked
24930 // truncstore.
24931 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24932 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24933 Value.getValueType().isInteger()) {
24934 Value = Value.getOperand(0);
24935 if (Value.getOpcode() == ISD::BITCAST) {
24936 EVT HalfVT =
24937 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24938 EVT InVT = Value.getOperand(0).getValueType();
24939
24940 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24941 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24942 unsigned PgPattern = Mask->getConstantOperandVal(0);
24943
24944 // Ensure we can double the size of the predicate pattern
24945 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24946 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24947 MinSVESize) {
24948 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24949 PgPattern);
24950 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24951 MST->getBasePtr(), MST->getOffset(), Mask,
24952 MST->getMemoryVT(), MST->getMemOperand(),
24953 MST->getAddressingMode(),
24954 /*IsTruncating=*/true);
24955 }
24956 }
24957 }
24958 }
24959
24960 if (MST->isTruncatingStore()) {
24961 EVT ValueVT = Value->getValueType(0);
24962 EVT MemVT = MST->getMemoryVT();
24963 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24964 return SDValue();
24965 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24966 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24967 MST->getOffset(), MST->getMask(),
24968 MST->getMemoryVT(), MST->getMemOperand(),
24969 MST->getAddressingMode(), true);
24970 }
24971 }
24972
24973 return SDValue();
24974}
24975
24976/// \return true if part of the index was folded into the Base.
24977static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24978 SDLoc DL, SelectionDAG &DAG) {
24979 // This function assumes a vector of i64 indices.
24980 EVT IndexVT = Index.getValueType();
24981 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24982 return false;
24983
24984 // Simplify:
24985 // BasePtr = Ptr
24986 // Index = X + splat(Offset)
24987 // ->
24988 // BasePtr = Ptr + Offset * scale.
24989 // Index = X
24990 if (Index.getOpcode() == ISD::ADD) {
24991 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24992 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24993 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24994 Index = Index.getOperand(0);
24995 return true;
24996 }
24997 }
24998
24999 // Simplify:
25000 // BasePtr = Ptr
25001 // Index = (X + splat(Offset)) << splat(Shift)
25002 // ->
25003 // BasePtr = Ptr + (Offset << Shift) * scale.
25004 // Index = X << splat(shift)
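// For example (illustrative): with Index = (X + splat(4)) << splat(3) and a
// scale of 1, the base pointer is advanced by (4 << 3) * 1 = 32 and the index
// becomes X << splat(3).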
25005 if (Index.getOpcode() == ISD::SHL &&
25006 Index.getOperand(0).getOpcode() == ISD::ADD) {
25007 SDValue Add = Index.getOperand(0);
25008 SDValue ShiftOp = Index.getOperand(1);
25009 SDValue OffsetOp = Add.getOperand(1);
25010 if (auto Shift = DAG.getSplatValue(ShiftOp))
25011 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
25012 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
25013 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25014 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25015 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
25016 Add.getOperand(0), ShiftOp);
25017 return true;
25018 }
25019 }
25020
25021 return false;
25022}
25023
25024// Analyse the specified address returning true if a more optimal addressing
25025// mode is available. When returning true all parameters are updated to reflect
25026// their recommended values.
25027 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
25028 SDValue &BasePtr, SDValue &Index,
25029 SelectionDAG &DAG) {
25030 // Try to iteratively fold parts of the index into the base pointer to
25031 // simplify the index as much as possible.
25032 bool Changed = false;
25033 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
25034 Changed = true;
25035
25036 // Only consider element types that are pointer sized as smaller types can
25037 // be easily promoted.
25038 EVT IndexVT = Index.getValueType();
25039 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
25040 return Changed;
25041
25042 // Can indices be trivially shrunk?
25043 EVT DataVT = N->getOperand(1).getValueType();
25044 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
25045 // will later be re-extended to 64 bits in legalization
25046 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
25047 return Changed;
25048 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
25049 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25050 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
25051 return true;
25052 }
25053
25054 // Match:
25055 // Index = step(const)
25056 int64_t Stride = 0;
25057 if (Index.getOpcode() == ISD::STEP_VECTOR) {
25058 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
25059 }
25060 // Match:
25061 // Index = step(const) << shift(const)
25062 else if (Index.getOpcode() == ISD::SHL &&
25063 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
25064 SDValue RHS = Index.getOperand(1);
25066 if (auto *Shift = dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
25067 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
25068 Stride = Step << Shift->getZExtValue();
25069 }
25070 }
25071
25072 // Return early because no supported pattern is found.
25073 if (Stride == 0)
25074 return Changed;
25075
25076 if (Stride < std::numeric_limits<int32_t>::min() ||
25077 Stride > std::numeric_limits<int32_t>::max())
25078 return Changed;
25079
25080 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25081 unsigned MaxVScale =
25082 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
25083 int64_t LastElementOffset =
25084 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
25085
25086 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
25087 LastElementOffset > std::numeric_limits<int32_t>::max())
25088 return Changed;
25089
25090 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25091 // Stride does not scale explicitly by 'Scale', because it happens in
25092 // the gather/scatter addressing mode.
25093 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
25094 return true;
25095}
25096
25097 static SDValue performMaskedGatherScatterCombine(
25098 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25099 if (!DCI.isBeforeLegalize())
25100 return SDValue();
25101 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
25102
25103 SDLoc DL(MGS);
25104 SDValue Chain = MGS->getChain();
25105 SDValue Scale = MGS->getScale();
25106 SDValue Index = MGS->getIndex();
25107 SDValue Mask = MGS->getMask();
25108 SDValue BasePtr = MGS->getBasePtr();
25109 ISD::MemIndexType IndexType = MGS->getIndexType();
25110
25111 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
25112 return SDValue();
25113
25114 // Here we catch such cases early and change MGATHER's IndexType to allow
25115 // the use of an Index that's more legalisation friendly.
25116 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
25117 SDValue PassThru = MGT->getPassThru();
25118 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
25119 return DAG.getMaskedGather(
25120 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
25121 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
25122 }
25123 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
25124 SDValue Data = MSC->getValue();
25125 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
25126 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
25127 DL, Ops, MSC->getMemOperand(), IndexType,
25128 MSC->isTruncatingStore());
25129 }
25130 auto *HG = cast<MaskedHistogramSDNode>(MGS);
25131 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
25132 Index, Scale, HG->getIntID()};
25133 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
25134 DL, Ops, HG->getMemOperand(), IndexType);
25135}
25136
25137/// Target-specific DAG combine function for NEON load/store intrinsics
25138/// to merge base address updates.
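/// For example (illustrative), a non-updating load followed by a matching
/// pointer increment
///   ld2 { v0.4s, v1.4s }, [x0] ; add x0, x0, #32
/// can be merged into the post-indexed form
///   ld2 { v0.4s, v1.4s }, [x0], #32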
25139 static SDValue performNEONPostLDSTCombine(SDNode *N,
25140 TargetLowering::DAGCombinerInfo &DCI,
25141 SelectionDAG &DAG) {
25142 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
25143 return SDValue();
25144
25145 unsigned AddrOpIdx = N->getNumOperands() - 1;
25146 SDValue Addr = N->getOperand(AddrOpIdx);
25147
25148 // Search for a use of the address operand that is an increment.
25149 for (SDUse &Use : Addr->uses()) {
25150 SDNode *User = Use.getUser();
25151 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25152 continue;
25153
25154 // Check that the add is independent of the load/store. Otherwise, folding
25155 // it would create a cycle.
25156 SmallPtrSet<const SDNode *, 32> Visited;
25157 SmallVector<const SDNode *, 16> Worklist;
25158 Visited.insert(Addr.getNode());
25159 Worklist.push_back(N);
25160 Worklist.push_back(User);
25161 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25162 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25163 continue;
25164
25165 // Find the new opcode for the updating load/store.
25166 bool IsStore = false;
25167 bool IsLaneOp = false;
25168 bool IsDupOp = false;
25169 unsigned NewOpc = 0;
25170 unsigned NumVecs = 0;
25171 unsigned IntNo = N->getConstantOperandVal(1);
25172 switch (IntNo) {
25173 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25174 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25175 NumVecs = 2; break;
25176 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25177 NumVecs = 3; break;
25178 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25179 NumVecs = 4; break;
25180 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25181 NumVecs = 2; IsStore = true; break;
25182 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25183 NumVecs = 3; IsStore = true; break;
25184 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25185 NumVecs = 4; IsStore = true; break;
25186 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25187 NumVecs = 2; break;
25188 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25189 NumVecs = 3; break;
25190 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25191 NumVecs = 4; break;
25192 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25193 NumVecs = 2; IsStore = true; break;
25194 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25195 NumVecs = 3; IsStore = true; break;
25196 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25197 NumVecs = 4; IsStore = true; break;
25198 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25199 NumVecs = 2; IsDupOp = true; break;
25200 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25201 NumVecs = 3; IsDupOp = true; break;
25202 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25203 NumVecs = 4; IsDupOp = true; break;
25204 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25205 NumVecs = 2; IsLaneOp = true; break;
25206 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25207 NumVecs = 3; IsLaneOp = true; break;
25208 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25209 NumVecs = 4; IsLaneOp = true; break;
25210 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25211 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25212 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25213 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25214 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25215 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25216 }
25217
25218 EVT VecTy;
25219 if (IsStore)
25220 VecTy = N->getOperand(2).getValueType();
25221 else
25222 VecTy = N->getValueType(0);
25223
25224 // If the increment is a constant, it must match the memory ref size.
25225 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25226 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25227 uint32_t IncVal = CInc->getZExtValue();
25228 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25229 if (IsLaneOp || IsDupOp)
25230 NumBytes /= VecTy.getVectorNumElements();
25231 if (IncVal != NumBytes)
25232 continue;
25233 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25234 }
25235 SmallVector<SDValue, 8> Ops;
25236 Ops.push_back(N->getOperand(0)); // Incoming chain
25237 // Load lane and store have vector list as input.
25238 if (IsLaneOp || IsStore)
25239 for (unsigned i = 2; i < AddrOpIdx; ++i)
25240 Ops.push_back(N->getOperand(i));
25241 Ops.push_back(Addr); // Base register
25242 Ops.push_back(Inc);
25243
25244 // Return Types.
25245 EVT Tys[6];
25246 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25247 unsigned n;
25248 for (n = 0; n < NumResultVecs; ++n)
25249 Tys[n] = VecTy;
25250 Tys[n++] = MVT::i64; // Type of write back register
25251 Tys[n] = MVT::Other; // Type of the chain
25252 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25253
25254 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25255 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25256 MemInt->getMemoryVT(),
25257 MemInt->getMemOperand());
25258
25259 // Update the uses.
25260 std::vector<SDValue> NewResults;
25261 for (unsigned i = 0; i < NumResultVecs; ++i) {
25262 NewResults.push_back(SDValue(UpdN.getNode(), i));
25263 }
25264 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25265 DCI.CombineTo(N, NewResults);
25266 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25267
25268 break;
25269 }
25270 return SDValue();
25271}
25272
25273// Checks to see if the value is the prescribed width and returns information
25274// about its extension mode.
25275static
25276bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25277 ExtType = ISD::NON_EXTLOAD;
25278 switch(V.getNode()->getOpcode()) {
25279 default:
25280 return false;
25281 case ISD::LOAD: {
25282 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25283 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25284 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25285 ExtType = LoadNode->getExtensionType();
25286 return true;
25287 }
25288 return false;
25289 }
25290 case ISD::AssertSext: {
25291 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25292 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25293 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25294 ExtType = ISD::SEXTLOAD;
25295 return true;
25296 }
25297 return false;
25298 }
25299 case ISD::AssertZext: {
25300 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25301 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25302 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25303 ExtType = ISD::ZEXTLOAD;
25304 return true;
25305 }
25306 return false;
25307 }
25308 case ISD::Constant:
25309 case ISD::TargetConstant: {
25310 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25311 1LL << (width - 1);
25312 }
25313 }
25314
25315 return true;
25316}
25317
25318// This function does a whole lot of voodoo to determine if the tests are
25319// equivalent without and with a mask. Essentially what happens is that given a
25320// DAG resembling:
25321//
25322// +-------------+ +-------------+ +-------------+ +-------------+
25323// | Input | | AddConstant | | CompConstant| | CC |
25324// +-------------+ +-------------+ +-------------+ +-------------+
25325// | | | |
25326// V V | +----------+
25327// +-------------+ +----+ | |
25328// | ADD | |0xff| | |
25329// +-------------+ +----+ | |
25330// | | | |
25331// V V | |
25332// +-------------+ | |
25333// | AND | | |
25334// +-------------+ | |
25335// | | |
25336// +-----+ | |
25337// | | |
25338// V V V
25339// +-------------+
25340// | CMP |
25341// +-------------+
25342//
25343// The AND node may be safely removed for some combinations of inputs. In
25344// particular we need to take into account the extension type of the Input,
25345// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25346 // width of the input (this can work for any width of input; the above graph is
25347 // specific to 8 bits).
25348//
25349// The specific equations were worked out by generating output tables for each
25350 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
25351// problem was simplified by working with 4 bit inputs, which means we only
25352// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25353// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25354// patterns present in both extensions (0,7). For every distinct set of
25355 // AddConstant and CompConstant bit patterns we can consider the masked and
25356// unmasked versions to be equivalent if the result of this function is true for
25357 // all 16 distinct bit patterns for the current extension type of Input (w0).
25358//
25359// sub w8, w0, w1
25360// and w10, w8, #0x0f
25361// cmp w8, w2
25362// cset w9, AArch64CC
25363// cmp w10, w2
25364// cset w11, AArch64CC
25365// cmp w9, w11
25366// cset w0, eq
25367// ret
25368//
25369// Since the above function shows when the outputs are equivalent it defines
25370// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25371// would be expensive to run during compiles. The equations below were written
25372 // in a test harness that confirmed they give outputs equivalent to the above
25373 // function for all inputs, so they can instead be used to determine if the
25374 // removal is legal.
25375//
25376 // isEquivalentMaskless() is the test for whether the AND can be removed; it is
25377 // factored out of the DAG recognition because the DAG can take several forms.
25378
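// Illustrative sketch only (not part of the in-tree implementation, hence the
// disabled block): the equations below can be cross-checked by brute force over
// every Width-bit input pattern, mirroring the test-harness approach described
// above. 'Cond' stands in for a single AArch64CC comparison on plain integers,
// and the helper name is hypothetical.
#if 0
static bool bruteForceMasklessEquivalence(int AddConstant, int CompConstant,
                                          bool SignExtended,
                                          function_ref<bool(int, int)> Cond,
                                          unsigned Width = 4) {
  const int Mask = (1 << Width) - 1;
  for (int Pattern = 0; Pattern <= Mask; ++Pattern) {
    // Model zero- or sign-extension of the Width-bit input, as described above.
    int Input = SignExtended ? SignExtend32(Pattern, Width) : Pattern;
    int Sum = Input + AddConstant;
    // The AND may only be dropped if the masked and unmasked comparisons agree
    // for every representable input.
    if (Cond(Sum & Mask, CompConstant) != Cond(Sum, CompConstant))
      return false;
  }
  return true;
}
#endif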
25379static bool isEquivalentMaskless(unsigned CC, unsigned width,
25380 ISD::LoadExtType ExtType, int AddConstant,
25381 int CompConstant) {
25382 // By being careful about our equations and only writing them in terms of
25383 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
25384 // make them generally applicable to all bit widths.
25385 int MaxUInt = (1 << width);
25386
25387 // For the purposes of these comparisons sign extending the type is
25388 // equivalent to zero extending the add and displacing it by half the integer
25389 // width. Provided we are careful and make sure our equations are valid over
25390 // the whole range we can just adjust the input and avoid writing equations
25391 // for sign extended inputs.
25392 if (ExtType == ISD::SEXTLOAD)
25393 AddConstant -= (1 << (width-1));
25394
25395 switch(CC) {
25396 case AArch64CC::LE:
25397 case AArch64CC::GT:
25398 if ((AddConstant == 0) ||
25399 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25400 (AddConstant >= 0 && CompConstant < 0) ||
25401 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25402 return true;
25403 break;
25404 case AArch64CC::LT:
25405 case AArch64CC::GE:
25406 if ((AddConstant == 0) ||
25407 (AddConstant >= 0 && CompConstant <= 0) ||
25408 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25409 return true;
25410 break;
25411 case AArch64CC::HI:
25412 case AArch64CC::LS:
25413 if ((AddConstant >= 0 && CompConstant < 0) ||
25414 (AddConstant <= 0 && CompConstant >= -1 &&
25415 CompConstant < AddConstant + MaxUInt))
25416 return true;
25417 break;
25418 case AArch64CC::PL:
25419 case AArch64CC::MI:
25420 if ((AddConstant == 0) ||
25421 (AddConstant > 0 && CompConstant <= 0) ||
25422 (AddConstant < 0 && CompConstant <= AddConstant))
25423 return true;
25424 break;
25425 case AArch64CC::LO:
25426 case AArch64CC::HS:
25427 if ((AddConstant >= 0 && CompConstant <= 0) ||
25428 (AddConstant <= 0 && CompConstant >= 0 &&
25429 CompConstant <= AddConstant + MaxUInt))
25430 return true;
25431 break;
25432 case AArch64CC::EQ:
25433 case AArch64CC::NE:
25434 if ((AddConstant > 0 && CompConstant < 0) ||
25435 (AddConstant < 0 && CompConstant >= 0 &&
25436 CompConstant < AddConstant + MaxUInt) ||
25437 (AddConstant >= 0 && CompConstant >= 0 &&
25438 CompConstant >= AddConstant) ||
25439 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
25440 return true;
25441 break;
25442 case AArch64CC::VS:
25443 case AArch64CC::VC:
25444 case AArch64CC::AL:
25445 case AArch64CC::NV:
25446 return true;
25447 case AArch64CC::Invalid:
25448 break;
25449 }
25450
25451 return false;
25452}
25453
25454 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
25455 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
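// For example (illustrative): (X & 0xF0) >u 0x0F becomes (X & 0xF0) != 0, and
// (X & 0xFF) <u 0x10 becomes (X & 0xF0) == 0.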
25456 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25457 SDNode *AndNode, SelectionDAG &DAG,
25458 unsigned CCIndex, unsigned CmpIndex,
25459 unsigned CC) {
25460 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25461 if (!SubsC)
25462 return SDValue();
25463
25464 APInt SubsAP = SubsC->getAPIntValue();
25465 if (CC == AArch64CC::HI) {
25466 if (!SubsAP.isMask())
25467 return SDValue();
25468 } else if (CC == AArch64CC::LO) {
25469 if (!SubsAP.isPowerOf2())
25470 return SDValue();
25471 } else
25472 return SDValue();
25473
25474 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25475 if (!AndC)
25476 return SDValue();
25477
25478 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25479
25480 SDLoc DL(N);
25481 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25482 SDValue ANDS = DAG.getNode(
25483 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25484 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25485 SDValue AArch64_CC =
25486 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
25487 N->getOperand(CCIndex)->getValueType(0));
25488
25489 // For now, only performCSELCombine and performBRCONDCombine call this
25490 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex, with 4
25491 // operands. So just initialize the operands directly to simplify the code. If
25492 // we ever have a caller with a different CCIndex or CmpIndex, this will need
25493 // to be rewritten with a loop.
25494 // TODO: Do we need to assert number of operand is 4 here?
25495 assert((CCIndex == 2 && CmpIndex == 3) &&
25496 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25497 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25498 ANDS.getValue(1)};
25499 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25500}
25501
25502static
25503 SDValue performCONDCombine(SDNode *N,
25504 TargetLowering::DAGCombinerInfo &DCI,
25505 SelectionDAG &DAG, unsigned CCIndex,
25506 unsigned CmpIndex) {
25507 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25508 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25509 unsigned CondOpcode = SubsNode->getOpcode();
25510
25511 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25512 !SubsNode->hasOneUse())
25513 return SDValue();
25514
25515 // There is a SUBS feeding this condition. Is it fed by a mask we can
25516 // use?
25517
25518 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25519 unsigned MaskBits = 0;
25520
25521 if (AndNode->getOpcode() != ISD::AND)
25522 return SDValue();
25523
25524 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25525 CmpIndex, CC))
25526 return Val;
25527
25528 // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
25529 // sequence of ones starting at the least significant bit with the remainder
25530 // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
25531 // into a SUBS (immediate). The transformed form can be matched into a SUBS
25532 // (shifted register).
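// For example (illustrative): on i32 with M = 0xFFFFFF and C = 0x123456,
// clz(M) = 8, so (X & 0xFFFFFF) == 0x123456 becomes
// (0x123456 << 8) == (X << 8), where the right-hand side matches the
// shifted-register form of SUBS.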
25533 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
25534 isa<ConstantSDNode>(AndNode->getOperand(1)) &&
25535 isa<ConstantSDNode>(SubsNode->getOperand(1))) {
25536 SDValue X = AndNode->getOperand(0);
25537 APInt M = AndNode->getConstantOperandAPInt(1);
25538 APInt C = SubsNode->getConstantOperandAPInt(1);
25539
25540 if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
25541 SDLoc DL(SubsNode);
25542 EVT VT = SubsNode->getValueType(0);
25543 unsigned ShiftAmt = M.countl_zero();
25544 SDValue ShiftedX = DAG.getNode(
25545 ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
25546 SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
25547 SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
25548 ShiftedC, ShiftedX);
25549 DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
25550 return SDValue(N, 0);
25551 }
25552 }
25553
25554 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25555 uint32_t CNV = CN->getZExtValue();
25556 if (CNV == 255)
25557 MaskBits = 8;
25558 else if (CNV == 65535)
25559 MaskBits = 16;
25560 }
25561
25562 if (!MaskBits)
25563 return SDValue();
25564
25565 SDValue AddValue = AndNode->getOperand(0);
25566
25567 if (AddValue.getOpcode() != ISD::ADD)
25568 return SDValue();
25569
25570 // The basic dag structure is correct, grab the inputs and validate them.
25571
25572 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25573 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25574 SDValue SubsInputValue = SubsNode->getOperand(1);
25575
25576 // The mask is present and the provenance of all the values is a smaller type,
25577 // so let's see if the mask is superfluous.
25578
25579 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25580 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25581 return SDValue();
25582
25583 ISD::LoadExtType ExtType;
25584
25585 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25586 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25587 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25588 return SDValue();
25589
25590 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25591 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25592 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25593 return SDValue();
25594
25595 // The AND is not necessary, remove it.
25596
25597 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25598 SubsNode->getValueType(1));
25599 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25600
25601 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25602 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25603
25604 return SDValue(N, 0);
25605}
25606
25607// Optimize compare with zero and branch.
25608 static SDValue performBRCONDCombine(SDNode *N,
25609 TargetLowering::DAGCombinerInfo &DCI,
25610 SelectionDAG &DAG) {
25611 MachineFunction &MF = DAG.getMachineFunction();
25612 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25613 // will not be produced, as they are conditional branch instructions that do
25614 // not set flags.
25615 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25616 return SDValue();
25617
25618 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25619 N = NV.getNode();
25620 SDValue Chain = N->getOperand(0);
25621 SDValue Dest = N->getOperand(1);
25622 SDValue CCVal = N->getOperand(2);
25623 SDValue Cmp = N->getOperand(3);
25624
25625 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25626 unsigned CC = CCVal->getAsZExtVal();
25627 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25628 return SDValue();
25629
25630 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25631 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25632 SDValue CSel = Cmp.getOperand(0);
25633 auto CSelCC = getCSETCondCode(CSel);
25634 if (CSelCC) {
25635 SDLoc DL(N);
25636 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25637 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25638 CSel.getOperand(3));
25639 }
25640 }
25641
25642 unsigned CmpOpc = Cmp.getOpcode();
25643 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25644 return SDValue();
25645
25646 // Only attempt folding if there is only one use of the flag and no use of the
25647 // value.
25648 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25649 return SDValue();
25650
25651 SDValue LHS = Cmp.getOperand(0);
25652 SDValue RHS = Cmp.getOperand(1);
25653
25654 assert(LHS.getValueType() == RHS.getValueType() &&
25655 "Expected the value type to be the same for both operands!");
25656 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25657 return SDValue();
25658
25659 if (isNullConstant(LHS))
25660 std::swap(LHS, RHS);
25661
25662 if (!isNullConstant(RHS))
25663 return SDValue();
25664
25665 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25666 LHS.getOpcode() == ISD::SRL)
25667 return SDValue();
25668
25669 // Fold the compare into the branch instruction.
25670 SDValue BR;
25671 if (CC == AArch64CC::EQ)
25672 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25673 else
25674 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25675
25676 // Do not add new nodes to DAG combiner worklist.
25677 DCI.CombineTo(N, BR, false);
25678
25679 return SDValue();
25680}
25681
25682 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25683 unsigned CC = N->getConstantOperandVal(2);
25684 SDValue SUBS = N->getOperand(3);
25685 SDValue Zero, CTTZ;
25686
25687 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25688 Zero = N->getOperand(0);
25689 CTTZ = N->getOperand(1);
25690 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25691 Zero = N->getOperand(1);
25692 CTTZ = N->getOperand(0);
25693 } else
25694 return SDValue();
25695
25696 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25697 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25698 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25699 return SDValue();
25700
25701 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25702 "Illegal type in CTTZ folding");
25703
25704 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25705 return SDValue();
25706
25707 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25708 ? CTTZ.getOperand(0).getOperand(0)
25709 : CTTZ.getOperand(0);
25710
25711 if (X != SUBS.getOperand(0))
25712 return SDValue();
25713
25714 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25715 ? CTTZ.getOperand(0).getValueSizeInBits()
25716 : CTTZ.getValueSizeInBits();
25717 SDValue BitWidthMinusOne =
25718 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25719 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25720 BitWidthMinusOne);
25721}
25722
25723// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25724// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25725// Where x and y are constants and x != y
25726
25727// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25728// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25729// Where x and y are constants and x != y
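// For example (illustrative): with x = 1 and y = 0,
//   (CSEL l r EQ (CMP (CSEL 1 0 cc2 cond) 1)) reduces to (CSEL l r cc2 cond),
// because the inner CSEL equals 1 exactly when cc2 holds for cond.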
25730 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25731 SDValue L = Op->getOperand(0);
25732 SDValue R = Op->getOperand(1);
25733 AArch64CC::CondCode OpCC =
25734 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25735
25736 SDValue OpCmp = Op->getOperand(3);
25737 if (!isCMP(OpCmp))
25738 return SDValue();
25739
25740 SDValue CmpLHS = OpCmp.getOperand(0);
25741 SDValue CmpRHS = OpCmp.getOperand(1);
25742
25743 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25744 std::swap(CmpLHS, CmpRHS);
25745 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25746 return SDValue();
25747
25748 SDValue X = CmpLHS->getOperand(0);
25749 SDValue Y = CmpLHS->getOperand(1);
25750 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25751 return SDValue();
25752 }
25753
25754 // If one of the constants is an opaque constant, the x and y SDNodes can still
25755 // differ even though the real values are the same, so compare the APInt values
25756 // here to make sure the code is correct.
25757 ConstantSDNode *CX = cast<ConstantSDNode>(X);
25758 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
25759 if (CX->getAPIntValue() == CY->getAPIntValue())
25760 return SDValue();
25761
25762 AArch64CC::CondCode CC =
25763 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25764 SDValue Cond = CmpLHS->getOperand(3);
25765
25766 if (CmpRHS == Y)
25767 CC = AArch64CC::getInvertedCondCode(CC);
25768 else if (CmpRHS != X)
25769 return SDValue();
25770
25771 if (OpCC == AArch64CC::NE)
25772 CC = AArch64CC::getInvertedCondCode(CC);
25773 else if (OpCC != AArch64CC::EQ)
25774 return SDValue();
25775
25776 SDLoc DL(Op);
25777 EVT VT = Op->getValueType(0);
25778
25779 SDValue CCValue = getCondCode(DAG, CC);
25780 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25781}
25782
25783// Reassociate the true/false expressions of a CSEL instruction to obtain a
25784// common subexpression with the comparison instruction. For example, change
25785// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25786// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25787// subexpression.
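// This is sound because (SUBS x c) produces x - c as its value result, so
// (ADD (ADD x y) -c) can be rebuilt as (ADD (SUBS x c) y) without changing the
// computed value.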
25788 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25789 SDValue SubsNode = N->getOperand(3);
25790 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25791 return SDValue();
25792
25793 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25794 SDValue CmpOpOther = SubsNode.getOperand(0);
25795 EVT VT = N->getValueType(0);
25796
25797 unsigned ExpectedOpcode;
25798 SDValue ExpectedOp;
25799 SDValue SubsOp;
25800 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25801 if (CmpOpConst) {
25802 ExpectedOpcode = ISD::ADD;
25803 ExpectedOp =
25804 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25805 CmpOpConst->getValueType(0));
25806 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25807 CmpOpConst->getValueType(0));
25808 } else {
25809 ExpectedOpcode = ISD::SUB;
25810 ExpectedOp = CmpOpToMatch;
25811 SubsOp = CmpOpToMatch;
25812 }
25813
25814 // Get the operand that can be reassociated with the SUBS instruction.
25815 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25816 if (Op.getOpcode() != ExpectedOpcode)
25817 return SDValue();
25818 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25819 !Op.getOperand(0).hasOneUse())
25820 return SDValue();
25821 SDValue X = Op.getOperand(0).getOperand(0);
25822 SDValue Y = Op.getOperand(0).getOperand(1);
25823 if (X != CmpOpOther)
25824 std::swap(X, Y);
25825 if (X != CmpOpOther)
25826 return SDValue();
25827 if (ExpectedOp != Op.getOperand(1))
25828 return SDValue();
25829 return Y;
25830 };
25831
25832 // Try the reassociation using the given constant and condition code.
25833 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25834 SDValue SubsOp) {
25835 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25836 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25837 if (!TReassocOp && !FReassocOp)
25838 return SDValue();
25839
25840 SDValue NewCmp =
25841 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25842 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25843
25844 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25845 if (!ReassocOp)
25846 return N->getOperand(OpNum);
25847 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25848 NewCmp.getValue(0), ReassocOp);
25849 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25850 return Res;
25851 };
25852
25853 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25854 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25855 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25856 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25857 };
25858
25859 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25860
25861 // First, try to eliminate the compare instruction by searching for a
25862 // subtraction with the same constant.
25863 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25864 return R;
25865
25866 if (!CmpOpConst) {
25867 // Try again with the operands of the SUBS instruction and the condition
25868 // swapped. Due to canonicalization, this only helps for non-constant
25869 // operands of the SUBS instruction.
25870 std::swap(CmpOpToMatch, CmpOpOther);
25871 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25872 return R;
25873 return SDValue();
25874 }
25875
25876 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25877 return SDValue();
25878
25879 // Next, search for a subtraction with a slightly different constant. By
25880 // adjusting the condition code, we can still eliminate the compare
25881 // instruction. Adjusting the constant is only valid if it does not result
25882 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25883 // Since such comparisons are trivially true/false, we should not encounter
25884 // them here but check for them nevertheless to be on the safe side.
25885 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25886 AArch64CC::CondCode NewCC) {
25887 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25888 CmpOpConst->getValueType(0));
25889 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25890 CmpOpConst->getValueType(0));
25891 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25892 };
25893 switch (CC) {
25894 case AArch64CC::EQ:
25895 case AArch64CC::LS:
25896 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25897 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25898 case AArch64CC::NE:
25899 case AArch64CC::HI:
25900 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25901 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25902 case AArch64CC::LO:
25903 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25904 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25905 case AArch64CC::HS:
25906 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25907 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25908 case AArch64CC::LT:
25909 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25910 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25911 case AArch64CC::LE:
25912 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25913 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25914 case AArch64CC::GT:
25915 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25916 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25917 case AArch64CC::GE:
25918 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25919 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25920 default:
25921 return SDValue();
25922 }
25923}
25924
25925 static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25926 AArch64CC::CondCode OpCC =
25927 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25928
25929 if (OpCC != AArch64CC::NE)
25930 return SDValue();
25931
25932 SDValue PTest = Op->getOperand(3);
25933 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25934 return SDValue();
25935
25936 SDValue TruePred = PTest.getOperand(0);
25937 SDValue AnyPred = PTest.getOperand(1);
25938
25939 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25940 TruePred = TruePred.getOperand(0);
25941
25942 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25943 AnyPred = AnyPred.getOperand(0);
25944
25945 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25946 return SDValue();
25947
25948 SDValue LastB = Op->getOperand(0);
25949 SDValue Default = Op->getOperand(1);
25950
25951 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25952 return SDValue();
25953
25954 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25955 AnyPred, Default, LastB.getOperand(1));
25956}
25957
25958// Optimize CSEL instructions
25959 static SDValue performCSELCombine(SDNode *N,
25960 TargetLowering::DAGCombinerInfo &DCI,
25961 SelectionDAG &DAG) {
25962 // CSEL x, x, cc -> x
25963 if (N->getOperand(0) == N->getOperand(1))
25964 return N->getOperand(0);
25965
25966 if (SDValue R = foldCSELOfCSEL(N, DAG))
25967 return R;
25968
25969 // Try to reassociate the true/false expressions so that we can do CSE with
25970 // a SUBS instruction used to perform the comparison.
25971 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25972 return R;
25973
25974 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25975 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
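// This is valid because ISD::CTTZ is defined to return the bitwidth for a zero
// input (AArch64 lowers it as RBIT+CLZ), so ANDing with bitwidth-1 maps that
// case to 0 while leaving all other cttz results, which are already less than
// the bitwidth, unchanged.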
25976 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25977 return Folded;
25978
25979 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25980 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25981 SDValue Cond = N->getOperand(3);
25982 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25983 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25984 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25985 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25986 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25987 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25988 !isNullConstant(Cond.getOperand(1))) {
25989 AArch64CC::CondCode OldCond =
25990 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25991 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25992 if (NewCond != AArch64CC::AL) {
25993 SDLoc DL(N);
25994 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25995 Cond.getOperand(1), Cond.getOperand(0));
25996 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25997 N->getOperand(1), getCondCode(DAG, NewCond),
25998 Sub.getValue(1));
25999 }
26000 }
26001
26002 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
26003 // use overflow flags, to avoid the comparison with zero. In case of success,
26004 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
26005 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
26006 // nodes with their SUBS equivalent as is already done for other flag-setting
26007 // operators, in which case doing the replacement here becomes redundant.
26008 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
26009 isNullConstant(Cond.getOperand(1))) {
26010 SDValue Sub = Cond.getOperand(0);
26011 AArch64CC::CondCode CC =
26012 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26013 if (Sub.getOpcode() == ISD::SUB &&
26014 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
26015 CC == AArch64CC::PL)) {
26016 SDLoc DL(N);
26017 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
26018 Sub.getOperand(0), Sub.getOperand(1));
26019 DCI.CombineTo(Sub.getNode(), Subs);
26020 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
26021 return SDValue(N, 0);
26022 }
26023 }
26024
26025 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
26026 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
26027 return CondLast;
26028
26029 return performCONDCombine(N, DCI, DAG, 2, 3);
26030}
26031
26032 // Try to re-use an already extended operand of a vector SetCC feeding an
26033// extended select. Doing so avoids requiring another full extension of the
26034// SET_CC result when lowering the select.
26035 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
26036 EVT Op0MVT = Op->getOperand(0).getValueType();
26037 if (!Op0MVT.isVector() || Op->use_empty())
26038 return SDValue();
26039
26040 // Make sure that all uses of Op are VSELECTs with result matching types where
26041 // the result type has a larger element type than the SetCC operand.
26042 SDNode *FirstUse = *Op->user_begin();
26043 if (FirstUse->getOpcode() != ISD::VSELECT)
26044 return SDValue();
26045 EVT UseMVT = FirstUse->getValueType(0);
26046 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
26047 return SDValue();
26048 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
26049 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
26050 }))
26051 return SDValue();
26052
26053 APInt V;
26054 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
26055 return SDValue();
26056
26057 SDLoc DL(Op);
26058 SDValue Op0ExtV;
26059 SDValue Op1ExtV;
26060 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
26061 // Check if the first operand of the SET_CC is already extended. If it is,
26062 // split the SET_CC and re-use the extended version of the operand.
26063 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
26064 Op->getOperand(0));
26065 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
26066 Op->getOperand(0));
26067 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26068 Op0ExtV = SDValue(Op0SExt, 0);
26069 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
26070 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26071 Op0ExtV = SDValue(Op0ZExt, 0);
26072 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
26073 } else
26074 return SDValue();
26075
26076 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
26077 Op0ExtV, Op1ExtV, Op->getOperand(2));
26078}
26079
26080static SDValue
26081 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26082 SelectionDAG &DAG) {
26083 SDValue Vec = N->getOperand(0);
26084 if (DCI.isBeforeLegalize() &&
26085 Vec.getValueType().getVectorElementType() == MVT::i1 &&
26086 Vec.getValueType().isFixedLengthVector() &&
26087 Vec.getValueType().isPow2VectorType()) {
26088 SDLoc DL(N);
26089 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
26090 DAG);
26091 }
26092
26093 return SDValue();
26094}
26095
26096 static SDValue performSETCCCombine(SDNode *N,
26097 TargetLowering::DAGCombinerInfo &DCI,
26098 SelectionDAG &DAG) {
26099 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
26100 SDValue LHS = N->getOperand(0);
26101 SDValue RHS = N->getOperand(1);
26102 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
26103 SDLoc DL(N);
26104 EVT VT = N->getValueType(0);
26105
26106 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
26107 return V;
26108
26109 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
26110 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
26111 LHS->getOpcode() == AArch64ISD::CSEL &&
26112 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
26113 LHS->hasOneUse()) {
26114 // Invert CSEL's condition.
26115 auto OldCond =
26116 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
26117 auto NewCond = getInvertedCondCode(OldCond);
26118
26119 // csel 0, 1, !cond, X
26120 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
26121 LHS.getOperand(0), LHS.getOperand(1),
26122 getCondCode(DAG, NewCond), LHS.getOperand(3));
26123 return DAG.getZExtOrTrunc(CSEL, DL, VT);
26124 }
26125
26126 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
26127 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
26128 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
26129 LHS->hasOneUse()) {
26130 EVT TstVT = LHS->getValueType(0);
26131 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
26132 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
26133 // This pattern gets optimized better in emitComparison.
26134 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
26135 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
26136 DAG.getSignedConstant(TstImm, DL, TstVT));
26137 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
26138 }
26139 }
26140
26141 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
26142 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
26143 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
26144 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
26145 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
26146 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
26147 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
26148 LHS->getOpcode() == ISD::BITCAST) {
26149 EVT ToVT = LHS->getValueType(0);
26150 EVT FromVT = LHS->getOperand(0).getValueType();
26151 if (FromVT.isFixedLengthVector() &&
26152 FromVT.getVectorElementType() == MVT::i1) {
26153 bool IsNull = isNullConstant(RHS);
26154 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
26155 DL, MVT::i1, LHS->getOperand(0));
26156 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
26157 LHS);
26158 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
26159 }
26160 }
26161
26162 // Try to perform the memcmp when the result is tested for [in]equality with 0
26163 if (SDValue V = performOrXorChainCombine(N, DAG))
26164 return V;
26165
26166 EVT CmpVT = LHS.getValueType();
26167
26168 // NOTE: This exists as a combine only because it proved too awkward to match
26169 // splat(1) across all the NEON types during isel.
26170 APInt SplatLHSVal;
26171 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
26172 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
26173 SplatLHSVal.isOne())
26174 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
26175
26176 return SDValue();
26177}
26178
26179// Replace a flag-setting operator (eg ANDS) with the generic version
26180// (eg AND) if the flag is unused.
26181 static SDValue performFlagSettingCombine(SDNode *N,
26182 TargetLowering::DAGCombinerInfo &DCI,
26183 unsigned GenericOpcode) {
26184 SDLoc DL(N);
26185 SDValue LHS = N->getOperand(0);
26186 SDValue RHS = N->getOperand(1);
26187 EVT VT = N->getValueType(0);
26188
26189 // If the flag result isn't used, convert back to a generic opcode.
26190 if (!N->hasAnyUseOfValue(1)) {
26191 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26192 return DCI.CombineTo(N, Res, SDValue(N, 1));
26193 }
26194
26195 // Combine identical generic nodes into this node, re-using the result.
26196 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26197 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
26198 DCI.CombineTo(Generic, SDValue(N, 0));
26199
26200 return SDValue();
26201}
26202
26203 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26204 // setcc_merge_zero pred
26205 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26206 // => extract_subvector (inner setcc_merge_zero)
26207 SDValue Pred = N->getOperand(0);
26208 SDValue LHS = N->getOperand(1);
26209 SDValue RHS = N->getOperand(2);
26210 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26211
26212 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26213 LHS->getOpcode() != ISD::SIGN_EXTEND)
26214 return SDValue();
26215
26216 SDValue Extract = LHS->getOperand(0);
26217 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26218 Extract->getValueType(0) != N->getValueType(0) ||
26219 Extract->getConstantOperandVal(1) != 0)
26220 return SDValue();
26221
26222 SDValue InnerSetCC = Extract->getOperand(0);
26223 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26224 return SDValue();
26225
26226 // By this point we've effectively got
26227 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26228 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26229 // can operate on A directly.
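// For example, when both the outer and inner predicates are ptrue with the
// same fixed pattern (vl1..vl256), the inner setcc_merge_zero has already
// zeroed the lanes beyond that pattern, so the extract_subvector can be
// returned directly.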
26230 SDValue InnerPred = InnerSetCC.getOperand(0);
26231 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26232 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26233 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26234 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26235 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26236 return Extract;
26237
26238 return SDValue();
26239}
26240
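// Returns true if V is (VASHR (VSHL X, BitWidth-1), BitWidth-1), i.e. each
// lane's lowest bit sign-extended across the whole element.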
26241static bool isSignExtInReg(const SDValue &V) {
26242 if (V.getOpcode() != AArch64ISD::VASHR ||
26243 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26244 return false;
26245
26246 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26247 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26248 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26249 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26250}
26251
26252static SDValue
26253 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26254 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26255 "Unexpected opcode!");
26256
26257 SelectionDAG &DAG = DCI.DAG;
26258 SDValue Pred = N->getOperand(0);
26259 SDValue LHS = N->getOperand(1);
26260 SDValue RHS = N->getOperand(2);
26261 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26262
26263 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26264 return V;
26265
26266 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26267 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26268 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26269 // setcc_merge_zero(
26270 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26271 // => setcc_merge_zero(pred, ...)
26272 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26273 LHS->getOperand(0)->getOperand(0) == Pred)
26274 return LHS->getOperand(0);
26275
26276 // setcc_merge_zero(
26277 // all_active, extend(nxvNi1 ...), != splat(0))
26278 // -> nxvNi1 ...
26279 if (isAllActivePredicate(DAG, Pred))
26280 return LHS->getOperand(0);
26281
26282 // setcc_merge_zero(
26283 // pred, extend(nxvNi1 ...), != splat(0))
26284 // -> nxvNi1 and(pred, ...)
26285 if (DCI.isAfterLegalizeDAG())
26286 // Do this after legalization to allow more folds on setcc_merge_zero
26287 // to be recognized.
26288 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26289 LHS->getOperand(0), Pred);
26290 }
26291
26292 // setcc_merge_zero(
26293 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26294 // => setcc_merge_zero(
26295 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26296 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26297 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26298 SDValue L0 = LHS->getOperand(0);
26299 SDValue L1 = LHS->getOperand(1);
26300 SDValue L2 = LHS->getOperand(2);
26301
26302 if (L0.getOpcode() == ISD::UNDEF && isNullConstant(L2) &&
26303 isSignExtInReg(L1)) {
26304 SDLoc DL(N);
26305 SDValue Shl = L1.getOperand(0);
26306 SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26307 LHS.getValueType(), L0, Shl, L2);
26308 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26309 Pred, NewLHS, RHS, N->getOperand(3));
26310 }
26311 }
26312
26313 return SDValue();
26314}
26315
26316// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26317// as well as whether the test should be inverted. This code is required to
26318// catch these cases (as opposed to standard dag combines) because
26319// AArch64ISD::TBZ is matched during legalization.
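// For example, (tbz (and (srl x, 2), 0x1), 0) is folded first to
// (tbz (srl x, 2), 0) and then to (tbz x, 2).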
26320static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26321 SelectionDAG &DAG) {
26322
26323 if (!Op->hasOneUse())
26324 return Op;
26325
26326 // We don't handle undef/constant-fold cases below, as they should have
26327 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26328 // etc.)
26329
26330 // (tbz (trunc x), b) -> (tbz x, b)
26331 // This case is just here to enable more of the below cases to be caught.
26332 if (Op->getOpcode() == ISD::TRUNCATE &&
26333 Bit < Op->getValueType(0).getSizeInBits()) {
26334 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26335 }
26336
26337 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26338 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26339 Bit < Op->getOperand(0).getValueSizeInBits()) {
26340 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26341 }
26342
26343 if (Op->getNumOperands() != 2)
26344 return Op;
26345
26346 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26347 if (!C)
26348 return Op;
26349
26350 switch (Op->getOpcode()) {
26351 default:
26352 return Op;
26353
26354 // (tbz (and x, m), b) -> (tbz x, b)
26355 case ISD::AND:
26356 if ((C->getZExtValue() >> Bit) & 1)
26357 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26358 return Op;
26359
26360 // (tbz (shl x, c), b) -> (tbz x, b-c)
26361 case ISD::SHL:
26362 if (C->getZExtValue() <= Bit &&
26363 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26364 Bit = Bit - C->getZExtValue();
26365 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26366 }
26367 return Op;
26368
26369 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26370 case ISD::SRA:
26371 Bit = Bit + C->getZExtValue();
26372 if (Bit >= Op->getValueType(0).getSizeInBits())
26373 Bit = Op->getValueType(0).getSizeInBits() - 1;
26374 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26375
26376 // (tbz (srl x, c), b) -> (tbz x, b+c)
26377 case ISD::SRL:
26378 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26379 Bit = Bit + C->getZExtValue();
26380 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26381 }
26382 return Op;
26383
26384 // (tbz (xor x, -1), b) -> (tbnz x, b)
26385 case ISD::XOR:
26386 if ((C->getZExtValue() >> Bit) & 1)
26387 Invert = !Invert;
26388 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26389 }
26390}
26391
26392// Optimize test single bit zero/non-zero and branch.
26393 static SDValue performTBZCombine(SDNode *N,
26394 TargetLowering::DAGCombinerInfo &DCI,
26395 SelectionDAG &DAG) {
26396 unsigned Bit = N->getConstantOperandVal(2);
26397 bool Invert = false;
26398 SDValue TestSrc = N->getOperand(1);
26399 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26400
26401 if (TestSrc == NewTestSrc)
26402 return SDValue();
26403
26404 unsigned NewOpc = N->getOpcode();
26405 if (Invert) {
26406 if (NewOpc == AArch64ISD::TBZ)
26407 NewOpc = AArch64ISD::TBNZ;
26408 else {
26409 assert(NewOpc == AArch64ISD::TBNZ);
26410 NewOpc = AArch64ISD::TBZ;
26411 }
26412 }
26413
26414 SDLoc DL(N);
26415 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26416 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26417}
26418
26419// Swap vselect operands where it may allow a predicated operation to achieve
26420// the `sel`.
26421//
26422// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26423// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
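// For example, with a scalable result type,
// (vselect (setcc oge x y) (a) (fadd a b))
// becomes (vselect (setcc ult x y) (fadd a b) (a)), so the FADD can be
// selected as a predicated operation merging into a.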
26424 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26425 auto SelectA = N->getOperand(1);
26426 auto SelectB = N->getOperand(2);
26427 auto NTy = N->getValueType(0);
26428
26429 if (!NTy.isScalableVector())
26430 return SDValue();
26431 SDValue SetCC = N->getOperand(0);
26432 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26433 return SDValue();
26434
26435 switch (SelectB.getOpcode()) {
26436 default:
26437 return SDValue();
26438 case ISD::FMUL:
26439 case ISD::FSUB:
26440 case ISD::FADD:
26441 break;
26442 }
26443 if (SelectA != SelectB.getOperand(0))
26444 return SDValue();
26445
26446 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26447 ISD::CondCode InverseCC =
26448 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26449 auto InverseSetCC =
26450 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26451 SetCC.getOperand(1), InverseCC);
26452
26453 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26454 {InverseSetCC, SelectB, SelectA});
26455}
26456
26457// vselect (v1i1 setcc) ->
26458// vselect (v1iXX setcc) (XX is the size of the compared operand type)
26459// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
26460// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
26461// such VSELECT.
26462 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
26463 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
26464 return SwapResult;
26465
26466 SDValue N0 = N->getOperand(0);
26467 SDValue IfTrue = N->getOperand(1);
26468 SDValue IfFalse = N->getOperand(2);
26469 EVT ResVT = N->getValueType(0);
26470 EVT CCVT = N0.getValueType();
26471
26472 if (isAllActivePredicate(DAG, N0))
26473 return N->getOperand(1);
26474
26475 if (isAllInactivePredicate(N0))
26476 return N->getOperand(2);
26477
26478 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
26479 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
26480 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
26481 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
26482 // -> merge_passthru_op A, B,{Bn,} C
26483 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
26484 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
26485 IfTrue->getOperand(0) == N0) {
26486 SmallVector<SDValue> Ops(IfTrue->op_begin(), IfTrue->op_end());
26487 Ops[0] = N0;
26488 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
26489
26490 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
26491 }
26492 }
26493
26494 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
26495 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
26496 // supported types.
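// For example, for v4i32 this rewrites
// (vselect (setgt x, splat(-1)), splat(1), splat(-1)) into
// (or (sra x, splat(31)), splat(1)): the shift yields 0 or -1 per lane and
// or-ing with 1 produces 1 or -1 respectively.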
26497 SDValue SetCC = N->getOperand(0);
26498 if (SetCC.getOpcode() == ISD::SETCC &&
26499 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26500 SDValue CmpLHS = SetCC.getOperand(0);
26501 EVT VT = CmpLHS.getValueType();
26502 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26503 SDNode *SplatLHS = N->getOperand(1).getNode();
26504 SDNode *SplatRHS = N->getOperand(2).getNode();
26505 APInt SplatLHSVal;
26506 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26507 VT.isSimple() &&
26508 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26509 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26510 VT.getSimpleVT().SimpleTy) &&
26511 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26512 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26513 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26514 unsigned NumElts = VT.getVectorNumElements();
26515 SmallVector<SDValue> Ops(
26516 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26517 VT.getScalarType()));
26518 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26519
26520 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26521 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26522 return Or;
26523 }
26524 }
26525
26526 EVT CmpVT = N0.getOperand(0).getValueType();
26527 if (N0.getOpcode() != ISD::SETCC ||
26529 CCVT.getVectorElementType() != MVT::i1 ||
26531 return SDValue();
26532
26533 // Only combine when the result type is of the same size as the compared
26534 // operands.
26535 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26536 return SDValue();
26537
26538 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26539 N0.getOperand(0), N0.getOperand(1),
26540 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26541 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26542 IfTrue, IfFalse);
26543}
26544
26545/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26546/// the compare-mask instructions rather than going via NZCV, even if LHS and
26547/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26548/// with a vector one followed by a DUP shuffle on the result.
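/// For example, (select (setcc f64 a, b, olt), v2f64 x, v2f64 y) becomes a
/// v2f64 vector compare of a and b whose lane-0 mask is broadcast across the
/// vector and used to select between x and y.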
26549 static SDValue performSelectCombine(SDNode *N,
26550 TargetLowering::DAGCombinerInfo &DCI) {
26551 SelectionDAG &DAG = DCI.DAG;
26552 SDValue N0 = N->getOperand(0);
26553 EVT ResVT = N->getValueType(0);
26554
26555 if (N0.getOpcode() != ISD::SETCC)
26556 return SDValue();
26557
26558 if (ResVT.isScalableVT())
26559 return SDValue();
26560
26561 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26562 // scalar SetCCResultType. We also don't expect vectors, because we assume
26563 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26564 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26565 "Scalar-SETCC feeding SELECT has unexpected result type!");
26566
26567 // If NumMaskElts == 0, the comparison is larger than select result. The
26568 // largest real NEON comparison is 64-bits per lane, which means the result is
26569 // at most 32-bits and an illegal vector. Just bail out for now.
26570 EVT SrcVT = N0.getOperand(0).getValueType();
26571
26572 // Don't try to do this optimization when the setcc itself has i1 operands.
26573 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26574 // ruled out to prevent the creation of setcc that need to be scalarized.
26575 if (SrcVT == MVT::i1 ||
26576 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26577 return SDValue();
26578
26579 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26580 if (!ResVT.isVector() || NumMaskElts == 0)
26581 return SDValue();
26582
26583 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26584 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26585
26586 // Also bail out if the vector CCVT isn't the same size as ResVT.
26587 // This can happen if the SETCC operand size doesn't divide the ResVT size
26588 // (e.g., f64 vs v3f32).
26589 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26590 return SDValue();
26591
26592 // Make sure we didn't create illegal types, if we're not supposed to.
26593 assert(DCI.isBeforeLegalize() ||
26594 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26595
26596 // First perform a vector comparison, where lane 0 is the one we're interested
26597 // in.
26598 SDLoc DL(N0);
26599 SDValue LHS =
26600 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26601 SDValue RHS =
26602 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26603 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26604
26605 // Now duplicate the comparison mask we want across all other lanes.
26606 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26607 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26608 Mask = DAG.getNode(ISD::BITCAST, DL,
26609 ResVT.changeVectorElementTypeToInteger(), Mask);
26610
26611 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26612}
26613
26614 static SDValue performDUPCombine(SDNode *N,
26615 TargetLowering::DAGCombinerInfo &DCI) {
26616 EVT VT = N->getValueType(0);
26617 SDLoc DL(N);
26618 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26619 // 128bit vector version.
26620 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26621 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26622 SmallVector<SDValue> Ops(N->ops());
26623 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26624 DCI.DAG.getVTList(LVT), Ops)) {
26625 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26626 DCI.DAG.getConstant(0, DL, MVT::i64));
26627 }
26628 }
26629
26630 if (N->getOpcode() == AArch64ISD::DUP) {
26631 // If the instruction is known to produce a scalar in SIMD registers, we can
26632 // duplicate it across the vector lanes using DUPLANE instead of moving it
26633 // to a GPR first. For example, this allows us to handle:
26634 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26635 SDValue Op = N->getOperand(0);
26636 // FIXME: Ideally, we should be able to handle all instructions that
26637 // produce a scalar value in FPRs.
26638 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26639 Op.getOpcode() == AArch64ISD::FCMGE ||
26640 Op.getOpcode() == AArch64ISD::FCMGT) {
26641 EVT ElemVT = VT.getVectorElementType();
26642 EVT ExpandedVT = VT;
26643 // Insert into a 128-bit vector to match DUPLANE's pattern.
26644 if (VT.getSizeInBits() != 128)
26645 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26646 128 / ElemVT.getSizeInBits());
26647 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26648 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26649 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26650 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26651 }
26652
26653 if (DCI.isAfterLegalizeDAG()) {
26654 // If scalar dup's operand is extract_vector_elt, try to combine them into
26655 // duplane. For example,
26656 //
26657 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26658 // t18: v4i32 = AArch64ISD::DUP t21
26659 // ==>
26660 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26661 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26662 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26663 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26664 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26665 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26666 EXTRACT_VEC_ELT.getOperand(1));
26667 }
26668 }
26669 }
26670
26671 return performPostLD1Combine(N, DCI, false);
26672 }
26673
26674 return SDValue();
26675}
26676
26677/// Get rid of unnecessary NVCASTs (that don't change the type).
26678 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26679 if (N->getValueType(0) == N->getOperand(0).getValueType())
26680 return N->getOperand(0);
26681 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26682 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26683 N->getOperand(0).getOperand(0));
26684
26685 return SDValue();
26686}
26687
26688// If all users of the globaladdr are of the form (globaladdr + constant), find
26689// the smallest constant, fold it into the globaladdr's offset and rewrite the
26690// globaladdr as (globaladdr + constant) - constant.
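// For example, if the only uses are (add g, 8) and (add g, 12), the node is
// rewritten as (sub (globaladdr g + 8), 8), leaving the uses as offsets 0 and
// 4 from the adjusted address.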
26691 static SDValue foldGlobalAddress(SDNode *N, SelectionDAG &DAG,
26692 const AArch64Subtarget *Subtarget,
26693 const TargetMachine &TM) {
26694 auto *GN = cast<GlobalAddressSDNode>(N);
26695 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26696 AArch64II::MO_NO_FLAG)
26697 return SDValue();
26698
26699 uint64_t MinOffset = -1ull;
26700 for (SDNode *N : GN->users()) {
26701 if (N->getOpcode() != ISD::ADD)
26702 return SDValue();
26703 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26704 if (!C)
26705 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26706 if (!C)
26707 return SDValue();
26708 MinOffset = std::min(MinOffset, C->getZExtValue());
26709 }
26710 uint64_t Offset = MinOffset + GN->getOffset();
26711
26712 // Require that the new offset is larger than the existing one. Otherwise, we
26713 // can end up oscillating between two possible DAGs, for example,
26714 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26715 if (Offset <= uint64_t(GN->getOffset()))
26716 return SDValue();
26717
26718 // Check whether folding this offset is legal. It must not go out of bounds of
26719 // the referenced object to avoid violating the code model, and must be
26720 // smaller than 2^20 because this is the largest offset expressible in all
26721 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26722 // stores an immediate signed 21 bit offset.)
26723 //
26724 // This check also prevents us from folding negative offsets, which will end
26725 // up being treated in the same way as large positive ones. They could also
26726 // cause code model violations, and aren't really common enough to matter.
26727 if (Offset >= (1 << 20))
26728 return SDValue();
26729
26730 const GlobalValue *GV = GN->getGlobal();
26731 Type *T = GV->getValueType();
26732 if (!T->isSized() ||
26734 return SDValue();
26735
26736 SDLoc DL(GN);
26737 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26738 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26739 DAG.getConstant(MinOffset, DL, MVT::i64));
26740}
26741
26742 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26743 const AArch64Subtarget *Subtarget) {
26744 SDValue BR = N->getOperand(0);
26745 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26747 return SDValue();
26748
26749 SDLoc DL(N);
26750 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26751}
26752
26753 // Turns the vector of indices into a vector of byte offsets by scaling Offset
26754// by (BitWidth / 8).
26756 SDLoc DL, unsigned BitWidth) {
26757 assert(Offset.getValueType().isScalableVector() &&
26758 "This method is only for scalable vectors of offsets");
26759
26760 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26761 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26762
26763 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26764}
26765
26766/// Check if the value of \p OffsetInBytes can be used as an immediate for
26767/// the gather load/prefetch and scatter store instructions with vector base and
26768/// immediate offset addressing mode:
26769///
26770/// [<Zn>.[S|D]{, #<imm>}]
26771///
26772/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
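/// For example, with 32-bit elements (ScalarSizeInBytes == 4) the valid
/// immediates are 0, 4, 8, ..., 124.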
26773inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26774 unsigned ScalarSizeInBytes) {
26775 // The immediate is not a multiple of the scalar size.
26776 if (OffsetInBytes % ScalarSizeInBytes)
26777 return false;
26778
26779 // The immediate is out of range.
26780 if (OffsetInBytes / ScalarSizeInBytes > 31)
26781 return false;
26782
26783 return true;
26784}
26785
26786/// Check if the value of \p Offset represents a valid immediate for the SVE
26787 /// gather load/prefetch and scatter store instructions with vector base and
26788/// immediate offset addressing mode:
26789///
26790/// [<Zn>.[S|D]{, #<imm>}]
26791///
26792/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26793 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26794 unsigned ScalarSizeInBytes) {
26795 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26796 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26797 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26798}
26799
26800 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26801 unsigned Opcode,
26802 bool OnlyPackedOffsets = true) {
26803 const SDValue Src = N->getOperand(2);
26804 const EVT SrcVT = Src->getValueType(0);
26805 assert(SrcVT.isScalableVector() &&
26806 "Scatter stores are only possible for SVE vectors");
26807
26808 SDLoc DL(N);
26809 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26810
26811 // Make sure that source data will fit into an SVE register
26812 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26813 return SDValue();
26814
26815 // For FPs, ACLE only supports _packed_ single and double precision types.
26816 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26817 if (SrcElVT.isFloatingPoint())
26818 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26819 ((Opcode != AArch64ISD::SST1Q_PRED &&
26820 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26821 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26822 return SDValue();
26823
26824 // Depending on the addressing mode, this is either a pointer or a vector of
26825 // pointers (that fits into one register)
26826 SDValue Base = N->getOperand(4);
26827 // Depending on the addressing mode, this is either a single offset or a
26828 // vector of offsets (that fits into one register)
26829 SDValue Offset = N->getOperand(5);
26830
26831 // For "scalar + vector of indices", just scale the indices. This only
26832 // applies to non-temporal scatters because there's no instruction that takes
26833 // indices.
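// For example, for 64-bit elements an index vector <0, 1, 2, ...> becomes the
// byte-offset vector <0, 8, 16, ...>, i.e. the indices shifted left by 3.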
26834 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26835 Offset =
26837 Opcode = AArch64ISD::SSTNT1_PRED;
26838 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26839 Offset =
26841 Opcode = AArch64ISD::SST1Q_PRED;
26842 }
26843
26844 // In the case of non-temporal scatter stores there's only one SVE instruction
26845 // per data-size: "scalar + vector", i.e.
26846 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26847 // Since we do have intrinsics that allow the arguments to be in a different
26848 // order, we may need to swap them to match the spec.
26849 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26850 Offset.getValueType().isVector())
26851 std::swap(Base, Offset);
26852
26853 // SST1_IMM requires that the offset is an immediate that is:
26854 // * a multiple of #SizeInBytes,
26855 // * in the range [0, 31 x #SizeInBytes],
26856 // where #SizeInBytes is the size in bytes of the stored items. For
26857 // immediates outside that range and non-immediate scalar offsets use SST1 or
26858 // SST1_UXTW instead.
26859 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26860 if (!isValidImmForSVEVecImmAddrMode(Offset,
26861 SrcVT.getScalarSizeInBits() / 8)) {
26862 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26863 Opcode = AArch64ISD::SST1_UXTW_PRED;
26864 else
26865 Opcode = AArch64ISD::SST1_PRED;
26866
26867 std::swap(Base, Offset);
26868 }
26869 }
26870
26871 auto &TLI = DAG.getTargetLoweringInfo();
26872 if (!TLI.isTypeLegal(Base.getValueType()))
26873 return SDValue();
26874
26875 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26876 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26877 // nxv2i64. Legalize accordingly.
26878 if (!OnlyPackedOffsets &&
26879 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26880 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26881
26882 if (!TLI.isTypeLegal(Offset.getValueType()))
26883 return SDValue();
26884
26885 // Source value type that is representable in hardware
26886 EVT HwSrcVt = getSVEContainerType(SrcVT);
26887
26888 // Keep the original type of the input data to store - this is needed to be
26889 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26890 // FP values we want the integer equivalent, so just use HwSrcVt.
26891 SDValue InputVT = DAG.getValueType(SrcVT);
26892 if (SrcVT.isFloatingPoint())
26893 InputVT = DAG.getValueType(HwSrcVt);
26894
26895 SDVTList VTs = DAG.getVTList(MVT::Other);
26896 SDValue SrcNew;
26897
26898 if (Src.getValueType().isFloatingPoint())
26899 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26900 else
26901 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26902
26903 SDValue Ops[] = {N->getOperand(0), // Chain
26904 SrcNew,
26905 N->getOperand(3), // Pg
26906 Base,
26907 Offset,
26908 InputVT};
26909
26910 return DAG.getNode(Opcode, DL, VTs, Ops);
26911}
26912
26913 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
26914 unsigned Opcode,
26915 bool OnlyPackedOffsets = true) {
26916 const EVT RetVT = N->getValueType(0);
26917 assert(RetVT.isScalableVector() &&
26918 "Gather loads are only possible for SVE vectors");
26919
26920 SDLoc DL(N);
26921
26922 // Make sure that the loaded data will fit into an SVE register
26923 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26924 return SDValue();
26925
26926 // Depending on the addressing mode, this is either a pointer or a vector of
26927 // pointers (that fits into one register)
26928 SDValue Base = N->getOperand(3);
26929 // Depending on the addressing mode, this is either a single offset or a
26930 // vector of offsets (that fits into one register)
26931 SDValue Offset = N->getOperand(4);
26932
26933 // For "scalar + vector of indices", scale the indices to obtain unscaled
26934 // offsets. This applies to non-temporal and quadword gathers, which do not
26935 // have an addressing mode with scaled offset.
26936 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26938 RetVT.getScalarSizeInBits());
26939 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26940 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26942 RetVT.getScalarSizeInBits());
26943 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26944 }
26945
26946 // In the case of non-temporal gather loads and quadword gather loads there's
26947 // only one addressing mode: "vector + scalar", e.g.
26948 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26949 // Since we do have intrinsics that allow the arguments to be in a different
26950 // order, we may need to swap them to match the spec.
26951 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26952 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26953 Offset.getValueType().isVector())
26954 std::swap(Base, Offset);
26955
26956 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26957 // * a multiple of #SizeInBytes,
26958 // * in the range [0, 31 x #SizeInBytes],
26959 // where #SizeInBytes is the size in bytes of the loaded items. For
26960 // immediates outside that range and non-immediate scalar offsets use
26961 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26962 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26963 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26964 if (!isValidImmForSVEVecImmAddrMode(Offset,
26965 RetVT.getScalarSizeInBits() / 8)) {
26966 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26967 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26968 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26969 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26970 else
26971 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26972 ? AArch64ISD::GLD1_MERGE_ZERO
26973 : AArch64ISD::GLDFF1_MERGE_ZERO;
26974
26975 std::swap(Base, Offset);
26976 }
26977 }
26978
26979 auto &TLI = DAG.getTargetLoweringInfo();
26980 if (!TLI.isTypeLegal(Base.getValueType()))
26981 return SDValue();
26982
26983 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26984 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26985 // nxv2i64. Legalize accordingly.
26986 if (!OnlyPackedOffsets &&
26987 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26988 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26989
26990 // Return value type that is representable in hardware
26991 EVT HwRetVt = getSVEContainerType(RetVT);
26992
26993 // Keep the original output value type around - this is needed to be able to
26994 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26995 // values we want the integer equivalent, so just use HwRetVT.
26996 SDValue OutVT = DAG.getValueType(RetVT);
26997 if (RetVT.isFloatingPoint())
26998 OutVT = DAG.getValueType(HwRetVt);
26999
27000 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
27001 SDValue Ops[] = {N->getOperand(0), // Chain
27002 N->getOperand(2), // Pg
27003 Base, Offset, OutVT};
27004
27005 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
27006 SDValue LoadChain = SDValue(Load.getNode(), 1);
27007
27008 if (RetVT.isInteger() && (RetVT != HwRetVt))
27009 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
27010
27011 // If the original return value was FP, bitcast accordingly. Doing it here
27012 // means that we can avoid adding TableGen patterns for FPs.
27013 if (RetVT.isFloatingPoint())
27014 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
27015
27016 return DAG.getMergeValues({Load, LoadChain}, DL);
27017}
27018
27019static SDValue
27020 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27021 SelectionDAG &DAG) {
27022 SDLoc DL(N);
27023 SDValue Src = N->getOperand(0);
27024 unsigned Opc = Src->getOpcode();
27025
27026 // Sign extend of an unsigned unpack -> signed unpack
27027 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
27028
27029 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
27030 : AArch64ISD::SUNPKLO;
27031
27032 // Push the sign extend to the operand of the unpack
27033 // This is necessary where, for example, the operand of the unpack
27034 // is another unpack:
27035 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
27036 // ->
27037 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
27038 // ->
27039 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
27040 SDValue ExtOp = Src->getOperand(0);
27041 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
27042 EVT EltTy = VT.getVectorElementType();
27043 (void)EltTy;
27044
27045 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
27046 "Sign extending from an invalid type");
27047
27048 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
27049
27050 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
27051 ExtOp, DAG.getValueType(ExtVT));
27052
27053 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
27054 }
27055
27056 // Sign extend of CSET -> CSETM.
27057 if (Opc == AArch64ISD::CSEL &&
27058 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
27059 EVT VT = N->getValueType(0);
27060 SDValue TVal = Src.getOperand(0);
27061 SDValue FVal = Src.getOperand(1);
27062
27063 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
27064 if (isNullConstant(TVal) && isOneConstant(FVal))
27065 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
27066 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
27067 Src.getOperand(3));
27068
27069 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
27070 if (isOneConstant(TVal) && isNullConstant(FVal))
27071 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
27072 DAG.getAllOnesConstant(DL, VT), FVal,
27073 Src.getOperand(2), Src.getOperand(3));
27074 }
27075
27076 if (DCI.isBeforeLegalizeOps())
27077 return SDValue();
27078
27079 if (!EnableCombineMGatherIntrinsics)
27080 return SDValue();
27081
27082 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
27083 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
27084 unsigned NewOpc;
27085 unsigned MemVTOpNum = 4;
27086 switch (Opc) {
27087 case AArch64ISD::LD1_MERGE_ZERO:
27088 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
27089 MemVTOpNum = 3;
27090 break;
27091 case AArch64ISD::LDNF1_MERGE_ZERO:
27092 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
27093 MemVTOpNum = 3;
27094 break;
27095 case AArch64ISD::LDFF1_MERGE_ZERO:
27096 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
27097 MemVTOpNum = 3;
27098 break;
27099 case AArch64ISD::GLD1_MERGE_ZERO:
27100 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
27101 break;
27102 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27103 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
27104 break;
27105 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27106 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
27107 break;
27108 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27109 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
27110 break;
27111 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27112 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
27113 break;
27114 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27115 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
27116 break;
27117 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27118 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
27119 break;
27120 case AArch64ISD::GLDFF1_MERGE_ZERO:
27121 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
27122 break;
27123 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
27124 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
27125 break;
27126 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
27127 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
27128 break;
27129 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
27130 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
27131 break;
27132 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
27133 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
27134 break;
27135 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
27136 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
27137 break;
27138 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
27139 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
27140 break;
27141 case AArch64ISD::GLDNT1_MERGE_ZERO:
27142 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
27143 break;
27144 default:
27145 return SDValue();
27146 }
27147
27148 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
27149 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
27150
27151 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
27152 return SDValue();
27153
27154 EVT DstVT = N->getValueType(0);
27155 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
27156
27158 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
27159 Ops.push_back(Src->getOperand(I));
27160
27161 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
27162 DCI.CombineTo(N, ExtLoad);
27163 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
27164
27165 // Return N so it doesn't get rechecked
27166 return SDValue(N, 0);
27167}
27168
27169/// Legalize the gather prefetch (scalar + vector addressing mode) when the
27170/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
27171/// != nxv2i32) do not need legalization.
27173 const unsigned OffsetPos = 4;
27174 SDValue Offset = N->getOperand(OffsetPos);
27175
27176 // Not an unpacked vector, bail out.
27177 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
27178 return SDValue();
27179
27180 // Extend the unpacked offset vector to 64-bit lanes.
27181 SDLoc DL(N);
27182 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
27184 // Replace the offset operand with the 64-bit one.
27185 Ops[OffsetPos] = Offset;
27186
27187 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27188}
27189
27190/// Combines a node carrying the intrinsic
27191/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27192/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27193/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27194/// sve gather prefetch instruction with vector plus immediate addressing mode.
27196 unsigned ScalarSizeInBytes) {
27197 const unsigned ImmPos = 4, OffsetPos = 3;
27198 // No need to combine the node if the immediate is valid...
27199 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27200 return SDValue();
27201
27202 // ...otherwise swap the offset base with the offset...
27204 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27205 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27206 // `aarch64_sve_prfb_gather_uxtw_index`.
27207 SDLoc DL(N);
27208 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
27209 MVT::i64);
27210
27211 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27212}
27213
27214// Return true if the vector operation can guarantee only the first lane of its
27215// result contains data, with all bits in other lanes set to zero.
27216 static bool isLanes1toNKnownZero(SDValue Op) {
27217 switch (Op.getOpcode()) {
27218 default:
27219 return false;
27220 case AArch64ISD::ANDV_PRED:
27221 case AArch64ISD::EORV_PRED:
27222 case AArch64ISD::FADDA_PRED:
27223 case AArch64ISD::FADDV_PRED:
27224 case AArch64ISD::FMAXNMV_PRED:
27225 case AArch64ISD::FMAXV_PRED:
27226 case AArch64ISD::FMINNMV_PRED:
27227 case AArch64ISD::FMINV_PRED:
27228 case AArch64ISD::ORV_PRED:
27229 case AArch64ISD::SADDV_PRED:
27230 case AArch64ISD::SMAXV_PRED:
27231 case AArch64ISD::SMINV_PRED:
27232 case AArch64ISD::UADDV_PRED:
27233 case AArch64ISD::UMAXV_PRED:
27234 case AArch64ISD::UMINV_PRED:
27235 return true;
27236 }
27237}
27238
27239// Return true if the vector operation can guarantee that the first lane of its
27240// result is active.
27241 static bool isLane0KnownActive(SDValue Op) {
27242 switch (Op.getOpcode()) {
27243 default:
27244 return false;
27245 case AArch64ISD::REINTERPRET_CAST:
27246 return isLane0KnownActive(Op->getOperand(0));
27247 case ISD::SPLAT_VECTOR:
27248 return isOneConstant(Op.getOperand(0));
27249 case AArch64ISD::PTRUE:
27250 return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
27251 };
27252}
27253
27255 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27256 SDValue InsertVec = N->getOperand(0);
27257 SDValue InsertElt = N->getOperand(1);
27258 SDValue InsertIdx = N->getOperand(2);
27259
27260 // We only care about inserts into the first element...
27261 if (!isNullConstant(InsertIdx))
27262 return SDValue();
27263 // ...of a zero'd vector...
27265 return SDValue();
27266 // ...where the inserted data was previously extracted...
27267 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27268 return SDValue();
27269
27270 SDValue ExtractVec = InsertElt.getOperand(0);
27271 SDValue ExtractIdx = InsertElt.getOperand(1);
27272
27273 // ...from the first element of a vector.
27274 if (!isNullConstant(ExtractIdx))
27275 return SDValue();
27276
27277 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27278
27279 // Ensure there's no type conversion going on.
27280 if (N->getValueType(0) != ExtractVec.getValueType())
27281 return SDValue();
27282
27283 if (!isLanes1toNKnownZero(ExtractVec))
27284 return SDValue();
27285
27286 // The explicit zeroing is redundant.
27287 return ExtractVec;
27288}
27289
27290static SDValue
27293 return Res;
27294
27295 return performPostLD1Combine(N, DCI, true);
27296}
27297
27298 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27299 TargetLowering::DAGCombinerInfo &DCI,
27300 const AArch64Subtarget *Subtarget) {
27301 SDValue N0 = N->getOperand(0);
27302 EVT VT = N->getValueType(0);
27303
27304 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
27305 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
27306 return SDValue();
27307
27308 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
27309 EVT EltVT = VT.getVectorElementType();
27310 return EltVT == MVT::f32 || EltVT == MVT::f64;
27311 };
27312
27313 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
27314 // We purposefully don't care about legality of the nodes here as we know
27315 // they can be split down into something legal.
27316 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
27317 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
27318 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
27319 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
27320 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
27321 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
27322 LN0->getChain(), LN0->getBasePtr(),
27323 N0.getValueType(), LN0->getMemOperand());
27324 DCI.CombineTo(N, ExtLoad);
27325 DCI.CombineTo(
27326 N0.getNode(),
27327 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
27328 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
27329 ExtLoad.getValue(1));
27330 return SDValue(N, 0); // Return N so it doesn't get rechecked!
27331 }
27332
27333 return SDValue();
27334}
27335
27336 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
27337 const AArch64Subtarget *Subtarget) {
27338 EVT VT = N->getValueType(0);
27339
27340 // Don't expand for NEON, SVE2 or SME
27341 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
27342 return SDValue();
27343
27344 SDLoc DL(N);
27345
27346 SDValue Mask = N->getOperand(0);
27347 SDValue In1 = N->getOperand(1);
27348 SDValue In2 = N->getOperand(2);
27349
27350 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
27351 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
27352 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
27353 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
27354}
27355
27356 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
27357 EVT VT = N->getValueType(0);
27358
27359 SDValue Insert = N->getOperand(0);
27360 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
27361 return SDValue();
27362
27363 if (!Insert.getOperand(0).isUndef())
27364 return SDValue();
27365
27366 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
27367 uint64_t IdxDupLane = N->getConstantOperandVal(1);
27368 if (IdxInsert != 0 || IdxDupLane != 0)
27369 return SDValue();
27370
27371 SDValue Bitcast = Insert.getOperand(1);
27372 if (Bitcast.getOpcode() != ISD::BITCAST)
27373 return SDValue();
27374
27375 SDValue Subvec = Bitcast.getOperand(0);
27376 EVT SubvecVT = Subvec.getValueType();
27377 if (!SubvecVT.is128BitVector())
27378 return SDValue();
27379 EVT NewSubvecVT =
27380 getPackedSVEVectorVT(SubvecVT.getVectorElementType());
27381
27382 SDLoc DL(N);
27383 SDValue NewInsert =
27384 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27385 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27386 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27387 NewInsert, N->getOperand(1));
27388 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27389}
27390
27391// Try to combine mull with uzp1.
27392 static SDValue tryCombineMULLWithUZP1(SDNode *N,
27393 TargetLowering::DAGCombinerInfo &DCI,
27394 SelectionDAG &DAG) {
27395 if (DCI.isBeforeLegalizeOps())
27396 return SDValue();
27397
27398 SDValue LHS = N->getOperand(0);
27399 SDValue RHS = N->getOperand(1);
27400
27401 SDValue ExtractHigh;
27402 SDValue ExtractLow;
27403 SDValue TruncHigh;
27404 SDValue TruncLow;
27405 SDLoc DL(N);
27406
27407 // Check the operands are trunc and extract_high.
27408 if (isEssentiallyExtractHighSubvector(LHS) &&
27409 RHS.getOpcode() == ISD::TRUNCATE) {
27410 TruncHigh = RHS;
27411 if (LHS.getOpcode() == ISD::BITCAST)
27412 ExtractHigh = LHS.getOperand(0);
27413 else
27414 ExtractHigh = LHS;
27415 } else if (isEssentiallyExtractHighSubvector(RHS) &&
27416 LHS.getOpcode() == ISD::TRUNCATE) {
27417 TruncHigh = LHS;
27418 if (RHS.getOpcode() == ISD::BITCAST)
27419 ExtractHigh = RHS.getOperand(0);
27420 else
27421 ExtractHigh = RHS;
27422 } else
27423 return SDValue();
27424
27425 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27426 // with uzp1.
27427 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27428 SDValue TruncHighOp = TruncHigh.getOperand(0);
27429 EVT TruncHighOpVT = TruncHighOp.getValueType();
27430 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
27431 DAG.isSplatValue(TruncHighOp, false))
27432 return SDValue();
27433
27434 // Check there is other extract_high with same source vector.
27435 // For example,
27436 //
27437 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
27438 // t12: v4i16 = truncate t11
27439 // t31: v4i32 = AArch64ISD::SMULL t18, t12
27440 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
27441 // t16: v4i16 = truncate t15
27442 // t30: v4i32 = AArch64ISD::SMULL t23, t16
27443 //
27444 // This dagcombine assumes the two extract_high nodes use the same source
27445 // vector in order to detect the pair of MULLs. If they use different source
27446 // vectors, this code will not work.
27447 // TODO: Should also try to look through a bitcast.
27448 bool HasFoundMULLow = true;
27449 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
27450 if (ExtractHighSrcVec->use_size() != 2)
27451 HasFoundMULLow = false;
27452
27453 // Find ExtractLow.
27454 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
27455 if (User == ExtractHigh.getNode())
27456 continue;
27457
27458 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27460 HasFoundMULLow = false;
27461 break;
27462 }
27463
27464 ExtractLow.setNode(User);
27465 }
27466
27467 if (!ExtractLow || !ExtractLow->hasOneUse())
27468 HasFoundMULLow = false;
27469
27470 // Check ExtractLow's user.
27471 if (HasFoundMULLow) {
27472 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
27473 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
27474 HasFoundMULLow = false;
27475 } else {
27476 if (ExtractLowUser->getOperand(0) == ExtractLow) {
27477 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
27478 TruncLow = ExtractLowUser->getOperand(1);
27479 else
27480 HasFoundMULLow = false;
27481 } else {
27482 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
27483 TruncLow = ExtractLowUser->getOperand(0);
27484 else
27485 HasFoundMULLow = false;
27486 }
27487 }
27488 }
27489
27490 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27491 // with uzp1.
27492 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27493 EVT TruncHighVT = TruncHigh.getValueType();
27494 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27495 SDValue TruncLowOp =
27496 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
27497 EVT TruncLowOpVT = TruncLowOp.getValueType();
27498 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
27499 DAG.isSplatValue(TruncLowOp, false)))
27500 return SDValue();
27501
27502 // Create uzp1, extract_high and extract_low.
27503 if (TruncHighOpVT != UZP1VT)
27504 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
27505 if (TruncLowOpVT != UZP1VT)
27506 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
27507
27508 SDValue UZP1 =
27509 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
27510 SDValue HighIdxCst =
27511 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
27512 SDValue NewTruncHigh =
27513 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
27514 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
27515
27516 if (HasFoundMULLow) {
27517 EVT TruncLowVT = TruncLow.getValueType();
27518 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
27519 UZP1, ExtractLow.getOperand(1));
27520 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
27521 }
27522
27523 return SDValue(N, 0);
27524}
27525
27528 SelectionDAG &DAG) {
27529 if (SDValue Val =
27531 return Val;
27532
27533 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27534 return Val;
27535
27536 return SDValue();
27537}
27538
27541 SelectionDAG &DAG) {
27542 if (DCI.isBeforeLegalize())
27543 return SDValue();
27544
27545 SDLoc DL(N);
27546 auto Mask = N->getOperand(0);
27547 auto Pred = N->getOperand(1);
27548
27549 if (!isLane0KnownActive(Mask))
27550 return SDValue();
27551
27552 if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
27553 Pred = Pred->getOperand(0);
27554
27555 if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
27556 Pred = Pred->getOperand(0);
27557 Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
27558 return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
27559 Pred);
27560 }
27561
27562 return SDValue();
27563}
27564
27565static SDValue
27567 SelectionDAG &DAG) {
27568 // Let's do below transform.
27569 //
27570 // t34: v4i32 = AArch64ISD::UADDLV t2
27571 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27572 // t7: i64 = zero_extend t35
27573 // t20: v1i64 = scalar_to_vector t7
27574 // ==>
27575 // t34: v4i32 = AArch64ISD::UADDLV t2
27576 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27577 // t40: v1i64 = AArch64ISD::NVCAST t39
27578 if (DCI.isBeforeLegalizeOps())
27579 return SDValue();
27580
27581 EVT VT = N->getValueType(0);
27582 if (VT != MVT::v1i64)
27583 return SDValue();
27584
27585 SDValue ZEXT = N->getOperand(0);
27586 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27587 return SDValue();
27588
27589 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27590 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27591 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27592 return SDValue();
27593
27594 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27595 return SDValue();
27596
27597 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27598 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27599 UADDLV.getValueType() != MVT::v4i32 ||
27600 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27601 return SDValue();
27602
27603 // Let's generate new sequence with AArch64ISD::NVCAST.
27604 SDLoc DL(N);
27605 SDValue EXTRACT_SUBVEC =
27606 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27607 DAG.getConstant(0, DL, MVT::i64));
27608 SDValue NVCAST =
27609 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27610
27611 return NVCAST;
27612}
27613
27614 static SDValue performVectorDeinterleaveCombine(
27615 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
27616 if (!DCI.isBeforeLegalize())
27617 return SDValue();
27618
27619 unsigned NumParts = N->getNumOperands();
27620 if (NumParts != 2 && NumParts != 4)
27621 return SDValue();
27622
27623 EVT SubVecTy = N->getValueType(0);
27624
27625 // At the moment we're unlikely to see a fixed-width vector deinterleave as
27626 // we usually generate shuffles instead.
27627 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
27628 if (!SubVecTy.isScalableVector() ||
27629 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
27630 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
27631 return SDValue();
27632
27633 // Make sure each input operand is the correct extract_subvector of the same
27634 // wider vector.
27635 SDValue Op0 = N->getOperand(0);
27636 for (unsigned I = 0; I < NumParts; I++) {
27637 SDValue OpI = N->getOperand(I);
27638 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27639 OpI->getOperand(0) != Op0->getOperand(0))
27640 return SDValue();
27641 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
27642 return SDValue();
27643 }
27644
27645 // Normal loads are currently already handled by the InterleavedAccessPass so
27646 // we don't expect to see them here. Bail out if the masked load has an
27647 // unexpected number of uses, since we want to avoid a situation where we have
27648 // both deinterleaving loads and normal loads in the same block. Also, discard
27649 // masked loads that are extending, indexed, have an unexpected offset or have
27650 // an unsupported passthru value until we find a valid use case.
27651 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
27652 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
27653 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
27654 !MaskedLoad->getOffset().isUndef() ||
27655 (!MaskedLoad->getPassThru()->isUndef() &&
27656 !isZerosVector(MaskedLoad->getPassThru().getNode())))
27657 return SDValue();
27658
27659 // Now prove that the mask is an interleave of identical masks.
27660 SDLoc DL(N);
27661 SDValue NarrowMask =
27662 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
27663 if (!NarrowMask)
27664 return SDValue();
27665
27666 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
27667 : Intrinsic::aarch64_sve_ld4_sret;
27668 SDValue NewLdOps[] = {MaskedLoad->getChain(),
27669 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
27670 MaskedLoad->getBasePtr()};
27671 SDValue Res;
27672 if (NumParts == 2)
27674 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
27675 else
27677 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
27678 NewLdOps);
27679
27680 // We can now generate a structured load!
27681 SmallVector<SDValue, 4> ResOps(NumParts);
27682 for (unsigned Idx = 0; Idx < NumParts; Idx++)
27683 ResOps[Idx] = SDValue(Res.getNode(), Idx);
27684
27685 // Replace uses of the original chain result with the new chain result.
27686 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
27687 SDValue(Res.getNode(), NumParts));
27688 return DCI.CombineTo(N, ResOps, false);
27689}
27690
27691/// If the operand is a bitwise AND with a constant RHS, and the shift has a
27692/// constant RHS and is the only use, we can pull it out of the shift, i.e.
27693///
27694/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27695///
27696/// We prefer this canonical form to match existing isel patterns.
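/// For example, (shl (and x, 0xff), 4) becomes (and (shl x, 4), 0xff0).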
27697 static SDValue performSHLCombine(SDNode *N,
27698 TargetLowering::DAGCombinerInfo &DCI,
27699 SelectionDAG &DAG) {
27700 if (DCI.isBeforeLegalizeOps())
27701 return SDValue();
27702
27703 SDValue Op0 = N->getOperand(0);
27704 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27705 return SDValue();
27706
27707 SDValue C1 = Op0->getOperand(1);
27708 SDValue C2 = N->getOperand(1);
27709 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27710 return SDValue();
27711
27712 // Might be folded into shifted op, do not lower.
27713 if (N->hasOneUse()) {
27714 unsigned UseOpc = N->user_begin()->getOpcode();
27715 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27716 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27717 return SDValue();
27718 }
27719
27720 SDLoc DL(N);
27721 EVT VT = N->getValueType(0);
27722
27723 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27724 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27725 // causing infinite loop. Result may also be worse.
27726 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27727 if (!isa<ConstantSDNode>(NewRHS))
27728 return SDValue();
27729
27730 SDValue X = Op0->getOperand(0);
27731 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27732 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27733}
27734
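// Lower the rndr/rndrrs intrinsics to an MRS read of the RNDR/RNDRRS system
// register; the status reported through NZCV is converted into the
// intrinsic's i1 success result with a CSINC.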
27735 static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
27736 unsigned IntrinsicID = N->getConstantOperandVal(1);
27737 auto Register =
27738 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27739 : AArch64SysReg::RNDRRS);
27740 SDLoc DL(N);
27741 SDValue A = DAG.getNode(
27742 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27743 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27744 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27745 DAG.getConstant(0, DL, MVT::i32),
27746 DAG.getConstant(0, DL, MVT::i32),
27747 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27748 return DAG.getMergeValues(
27749 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27750}
27751
27752 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
27753 DAGCombinerInfo &DCI) const {
27754 SelectionDAG &DAG = DCI.DAG;
27755 switch (N->getOpcode()) {
27756 default:
27757 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27758 break;
27759 case ISD::VECTOR_DEINTERLEAVE:
27760 return performVectorDeinterleaveCombine(N, DCI, DAG);
27761 case ISD::VECREDUCE_AND:
27762 case ISD::VECREDUCE_OR:
27763 case ISD::VECREDUCE_XOR:
27764 return performVecReduceBitwiseCombine(N, DCI, DAG);
27765 case ISD::ADD:
27766 case ISD::SUB:
27767 return performAddSubCombine(N, DCI);
27768 case ISD::BUILD_VECTOR:
27769 return performBuildVectorCombine(N, DCI, DAG);
27770 case ISD::SMIN:
27771 return performSMINCombine(N, DAG);
27772 case ISD::TRUNCATE:
27773 return performTruncateCombine(N, DAG, DCI);
27774 case AArch64ISD::ANDS:
27775 return performFlagSettingCombine(N, DCI, ISD::AND);
27776 case AArch64ISD::ADC:
27777 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27778 return R;
27779 return foldADCToCINC(N, DAG);
27780 case AArch64ISD::SBC:
27781 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27782 case AArch64ISD::ADCS:
27783 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27784 return R;
27785 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27786 case AArch64ISD::SBCS:
27787 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27788 return R;
27789 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27790 case AArch64ISD::ADDS:
27791 return performFlagSettingCombine(N, DCI, ISD::ADD);
27792 case AArch64ISD::SUBS:
27793 return performFlagSettingCombine(N, DCI, ISD::SUB);
27794 case AArch64ISD::BICi: {
27795 APInt DemandedBits =
27796 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27797 APInt DemandedElts =
27798 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27799 
27800 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
27801 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27802 return SDValue();
27803
27804 break;
27805 }
27806 case ISD::XOR:
27807 return performXorCombine(N, DAG, DCI, Subtarget);
27808 case ISD::MUL:
27809 return performMulCombine(N, DAG, DCI, Subtarget);
27810 case ISD::SINT_TO_FP:
27811 case ISD::UINT_TO_FP:
27812 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27813 case ISD::FP_TO_SINT:
27814 case ISD::FP_TO_UINT:
27815 case ISD::FP_TO_SINT_SAT:
27816 case ISD::FP_TO_UINT_SAT:
27817 return performFpToIntCombine(N, DAG, DCI, Subtarget);
27818 case ISD::OR:
27819 return performORCombine(N, DCI, Subtarget, *this);
27820 case ISD::AND:
27821 return performANDCombine(N, DCI);
27822 case ISD::FADD:
27823 return performFADDCombine(N, DCI);
27824 case ISD::INTRINSIC_WO_CHAIN:
27825 return performIntrinsicCombine(N, DCI, Subtarget);
27826 case ISD::ANY_EXTEND:
27827 case ISD::ZERO_EXTEND:
27828 case ISD::SIGN_EXTEND:
27829 return performExtendCombine(N, DCI, DAG);
27830 case ISD::SIGN_EXTEND_INREG:
27831 return performSignExtendInRegCombine(N, DCI, DAG);
27832 case ISD::CONCAT_VECTORS:
27833 return performConcatVectorsCombine(N, DCI, DAG);
27834 case ISD::EXTRACT_SUBVECTOR:
27835 return performExtractSubvectorCombine(N, DCI, DAG);
27836 case ISD::INSERT_SUBVECTOR:
27837 return performInsertSubvectorCombine(N, DCI, DAG);
27838 case ISD::SELECT:
27839 return performSelectCombine(N, DCI);
27840 case ISD::VSELECT:
27841 return performVSelectCombine(N, DCI.DAG);
27842 case ISD::SETCC:
27843 return performSETCCCombine(N, DCI, DAG);
27844 case ISD::LOAD:
27845 return performLOADCombine(N, DCI, DAG, Subtarget);
27846 case ISD::STORE:
27847 return performSTORECombine(N, DCI, DAG, Subtarget);
27848 case ISD::MSTORE:
27849 return performMSTORECombine(N, DCI, DAG, Subtarget);
27850 case ISD::MGATHER:
27851 case ISD::MSCATTER:
27852 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
27853 return performMaskedGatherScatterCombine(N, DCI, DAG);
27854 case ISD::FP_EXTEND:
27855 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27856 case AArch64ISD::BRCOND:
27857 return performBRCONDCombine(N, DCI, DAG);
27858 case AArch64ISD::TBNZ:
27859 case AArch64ISD::TBZ:
27860 return performTBZCombine(N, DCI, DAG);
27861 case AArch64ISD::CSEL:
27862 return performCSELCombine(N, DCI, DAG);
27863 case AArch64ISD::DUP:
27864 case AArch64ISD::DUPLANE8:
27865 case AArch64ISD::DUPLANE16:
27866 case AArch64ISD::DUPLANE32:
27867 case AArch64ISD::DUPLANE64:
27868 return performDUPCombine(N, DCI);
27869 case AArch64ISD::DUPLANE128:
27870 return performDupLane128Combine(N, DAG);
27871 case AArch64ISD::NVCAST:
27872 return performNVCASTCombine(N, DAG);
27873 case AArch64ISD::SPLICE:
27874 return performSpliceCombine(N, DAG);
27875 case AArch64ISD::UUNPKLO:
27876 case AArch64ISD::UUNPKHI:
27877 return performUnpackCombine(N, DAG, Subtarget);
27878 case AArch64ISD::UZP1:
27879 case AArch64ISD::UZP2:
27880 return performUzpCombine(N, DAG, Subtarget);
27881 case AArch64ISD::SETCC_MERGE_ZERO:
27882 return performSetccMergeZeroCombine(N, DCI);
27883 case AArch64ISD::REINTERPRET_CAST:
27884 return performReinterpretCastCombine(N, DAG);
27885 case AArch64ISD::GLD1_MERGE_ZERO:
27886 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27887 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27888 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27889 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27890 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27891 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27892 case AArch64ISD::GLD1S_MERGE_ZERO:
27893 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27894 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27895 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27896 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27897 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27898 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27899 return performGLD1Combine(N, DAG);
27900 case AArch64ISD::VASHR:
27901 case AArch64ISD::VLSHR:
27902 return performVectorShiftCombine(N, *this, DCI);
27903 case AArch64ISD::SUNPKLO:
27904 return performSunpkloCombine(N, DAG);
27905 case AArch64ISD::BSP:
27906 return performBSPExpandForSVE(N, DAG, Subtarget);
27907 case ISD::INSERT_VECTOR_ELT:
27908 return performInsertVectorEltCombine(N, DCI);
27909 case ISD::EXTRACT_VECTOR_ELT:
27910 return performExtractVectorEltCombine(N, DCI, Subtarget);
27911 case ISD::VECREDUCE_ADD:
27912 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27913 case ISD::GET_ACTIVE_LANE_MASK:
27914 return performActiveLaneMaskCombine(N, DCI, Subtarget);
27915 case AArch64ISD::UADDV:
27916 return performUADDVCombine(N, DAG);
27917 case AArch64ISD::SMULL:
27918 case AArch64ISD::UMULL:
27919 case AArch64ISD::PMULL:
27920 return performMULLCombine(N, DCI, DAG);
27921 case AArch64ISD::PTEST_FIRST:
27922 return performPTestFirstCombine(N, DCI, DAG);
27923 case ISD::INTRINSIC_VOID:
27924 case ISD::INTRINSIC_W_CHAIN:
27925 switch (N->getConstantOperandVal(1)) {
27926 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27927 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27928 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27929 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27930 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27931 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27932 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27933 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27934 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27935 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27936 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27937 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27938 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27939 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27940 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27941 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27942 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
27943 case Intrinsic::aarch64_neon_ld2:
27944 case Intrinsic::aarch64_neon_ld3:
27945 case Intrinsic::aarch64_neon_ld4:
27946 case Intrinsic::aarch64_neon_ld1x2:
27947 case Intrinsic::aarch64_neon_ld1x3:
27948 case Intrinsic::aarch64_neon_ld1x4:
27949 case Intrinsic::aarch64_neon_ld2lane:
27950 case Intrinsic::aarch64_neon_ld3lane:
27951 case Intrinsic::aarch64_neon_ld4lane:
27952 case Intrinsic::aarch64_neon_ld2r:
27953 case Intrinsic::aarch64_neon_ld3r:
27954 case Intrinsic::aarch64_neon_ld4r:
27955 case Intrinsic::aarch64_neon_st2:
27956 case Intrinsic::aarch64_neon_st3:
27957 case Intrinsic::aarch64_neon_st4:
27958 case Intrinsic::aarch64_neon_st1x2:
27959 case Intrinsic::aarch64_neon_st1x3:
27960 case Intrinsic::aarch64_neon_st1x4:
27961 case Intrinsic::aarch64_neon_st2lane:
27962 case Intrinsic::aarch64_neon_st3lane:
27963 case Intrinsic::aarch64_neon_st4lane:
27964 return performNEONPostLDSTCombine(N, DCI, DAG);
27965 case Intrinsic::aarch64_sve_ldnt1:
27966 return performLDNT1Combine(N, DAG);
27967 case Intrinsic::aarch64_sve_ld1rq:
27968 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
27969 case Intrinsic::aarch64_sve_ld1ro:
27970 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
27971 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27972 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27973 case Intrinsic::aarch64_sve_ldnt1_gather:
27974 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27975 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27976 return performGatherLoadCombine(N, DAG,
27977 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27978 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27979 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27980 case Intrinsic::aarch64_sve_ld1:
27981 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27982 case Intrinsic::aarch64_sve_ldnf1:
27983 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27984 case Intrinsic::aarch64_sve_ldff1:
27985 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27986 case Intrinsic::aarch64_sve_st1:
27987 return performST1Combine(N, DAG);
27988 case Intrinsic::aarch64_sve_stnt1:
27989 return performSTNT1Combine(N, DAG);
27990 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27991 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27992 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27993 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27994 case Intrinsic::aarch64_sve_stnt1_scatter:
27995 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27996 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27997 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27998 case Intrinsic::aarch64_sve_ld1_gather:
27999 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
28000 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
28001 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
28002 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
28003 case Intrinsic::aarch64_sve_ld1q_gather_index:
28004 return performGatherLoadCombine(N, DAG,
28005 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
28006 case Intrinsic::aarch64_sve_ld1_gather_index:
28007 return performGatherLoadCombine(N, DAG,
28008 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
28009 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
28010 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
28011 /*OnlyPackedOffsets=*/false);
28012 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
28013 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
28014 /*OnlyPackedOffsets=*/false);
28015 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
28016 return performGatherLoadCombine(N, DAG,
28017 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
28018 /*OnlyPackedOffsets=*/false);
28019 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
28020 return performGatherLoadCombine(N, DAG,
28021 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
28022 /*OnlyPackedOffsets=*/false);
28023 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
28024 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
28025 case Intrinsic::aarch64_sve_ldff1_gather:
28026 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
28027 case Intrinsic::aarch64_sve_ldff1_gather_index:
28028 return performGatherLoadCombine(N, DAG,
28029 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
28030 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
28031 return performGatherLoadCombine(N, DAG,
28032 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
28033 /*OnlyPackedOffsets=*/false);
28034 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
28035 return performGatherLoadCombine(N, DAG,
28036 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
28037 /*OnlyPackedOffsets=*/false);
28038 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
28039 return performGatherLoadCombine(N, DAG,
28040 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
28041 /*OnlyPackedOffsets=*/false);
28042 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
28043 return performGatherLoadCombine(N, DAG,
28044 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
28045 /*OnlyPackedOffsets=*/false);
28046 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
28047 return performGatherLoadCombine(N, DAG,
28048 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
28049 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
28050 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
28051 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
28052 case Intrinsic::aarch64_sve_st1q_scatter_index:
28053 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
28054 case Intrinsic::aarch64_sve_st1_scatter:
28055 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
28056 case Intrinsic::aarch64_sve_st1_scatter_index:
28057 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
28058 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
28059 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
28060 /*OnlyPackedOffsets=*/false);
28061 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
28062 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
28063 /*OnlyPackedOffsets=*/false);
28064 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
28065 return performScatterStoreCombine(N, DAG,
28066 AArch64ISD::SST1_SXTW_SCALED_PRED,
28067 /*OnlyPackedOffsets=*/false);
28068 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
28069 return performScatterStoreCombine(N, DAG,
28070 AArch64ISD::SST1_UXTW_SCALED_PRED,
28071 /*OnlyPackedOffsets=*/false);
28072 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
28073 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
28074 case Intrinsic::aarch64_rndr:
28075 case Intrinsic::aarch64_rndrrs:
28076 return performRNDRCombine(N, DAG);
28077 case Intrinsic::aarch64_sme_ldr_zt:
28078 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
28079 DAG.getVTList(MVT::Other), N->getOperand(0),
28080 N->getOperand(2), N->getOperand(3));
28081 case Intrinsic::aarch64_sme_str_zt:
28082 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
28083 DAG.getVTList(MVT::Other), N->getOperand(0),
28084 N->getOperand(2), N->getOperand(3));
28085 default:
28086 break;
28087 }
28088 break;
28089 case ISD::GlobalAddress:
28090 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
28091 case ISD::CTLZ:
28092 return performCTLZCombine(N, DAG, Subtarget);
28093 case ISD::SCALAR_TO_VECTOR:
28094 return performScalarToVectorCombine(N, DCI, DAG);
28095 case ISD::SHL:
28096 return performSHLCombine(N, DCI, DAG);
28097 }
28098 return SDValue();
28099}
28100
28101// Check if the return value is used as only a return value, as otherwise
28102// we can't perform a tail-call. In particular, we need to check for
28103// target ISD nodes that are returns and any other "odd" constructs
28104// that the generic analysis code won't necessarily catch.
28105bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
28106 SDValue &Chain) const {
28107 if (N->getNumValues() != 1)
28108 return false;
28109 if (!N->hasNUsesOfValue(1, 0))
28110 return false;
28111
28112 SDValue TCChain = Chain;
28113 SDNode *Copy = *N->user_begin();
28114 if (Copy->getOpcode() == ISD::CopyToReg) {
28115 // If the copy has a glue operand, we conservatively assume it isn't safe to
28116 // perform a tail call.
28117 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
28118 MVT::Glue)
28119 return false;
28120 TCChain = Copy->getOperand(0);
28121 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
28122 return false;
28123
28124 bool HasRet = false;
28125 for (SDNode *Node : Copy->users()) {
28126 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
28127 return false;
28128 HasRet = true;
28129 }
28130
28131 if (!HasRet)
28132 return false;
28133
28134 Chain = TCChain;
28135 return true;
28136}
28137
28138 // Return whether an instruction can potentially be optimized to a tail
28139// call. This will cause the optimizers to attempt to move, or duplicate,
28140// return instructions to help enable tail call optimizations for this
28141// instruction.
28142bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
28143 return CI->isTailCall();
28144}
28145
28146bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
28147 Register Offset, bool IsPre,
28148 MachineRegisterInfo &MRI) const {
28149 auto CstOffset = getIConstantVRegVal(Offset, MRI);
28150 if (!CstOffset || CstOffset->isZero())
28151 return false;
28152
28153 // All of the indexed addressing mode instructions take a signed 9 bit
28154 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
28155 // encodes the sign/indexing direction.
28156 return isInt<9>(CstOffset->getSExtValue());
28157}
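// Worked example (editorial, not part of the original source): isInt<9> above
// accepts offsets in [-256, 255], so a G_PTR_ADD of +255 or -256 can form a
// pre/post-indexed access, while +256 or -257 cannot.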
28158
28159bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
28160 SDValue &Base,
28161 SDValue &Offset,
28162 SelectionDAG &DAG) const {
28163 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
28164 return false;
28165
28166 // Non-null if there is exactly one user of the loaded value (ignoring chain).
28167 SDNode *ValOnlyUser = nullptr;
28168 for (SDUse &U : N->uses()) {
28169 if (U.getResNo() == 1)
28170 continue; // Ignore chain.
28171 if (ValOnlyUser == nullptr)
28172 ValOnlyUser = U.getUser();
28173 else {
28174 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
28175 break;
28176 }
28177 }
28178
28179 auto IsUndefOrZero = [](SDValue V) {
28180 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
28181 };
28182
28183 // If the only user of the value is a scalable vector splat, it is
28184 // preferable to do a replicating load (ld1r*).
28185 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
28186 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
28187 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
28188 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
28189 return false;
28190
28191 Base = Op->getOperand(0);
28192 // All of the indexed addressing mode instructions take a signed
28193 // 9 bit immediate offset.
28194 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
28195 int64_t RHSC = RHS->getSExtValue();
28196 if (Op->getOpcode() == ISD::SUB)
28197 RHSC = -(uint64_t)RHSC;
28198 if (!isInt<9>(RHSC))
28199 return false;
28200 // When big-endian VLD1/VST1 are used for vector load and store, they only
28201 // allow an offset that's equal to the store size.
28202 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
28203 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
28204 (uint64_t)RHSC != MemType.getStoreSize())
28205 return false;
28206 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
28207 // when dealing with subtraction.
28208 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
28209 return true;
28210 }
28211 return false;
28212}
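// Illustrative sketch (editorial, not part of the original source): the
// offsets accepted here feed the writeback addressing forms, e.g.
//   ldr x0, [x1, #16]!   ; pre-indexed:  x1 is updated before the access
//   ldr x0, [x1], #16    ; post-indexed: x1 is updated after the access
// hence the signed 9-bit immediate restriction (and the store-size-only
// restriction for big-endian vector VLD1/VST1 accesses).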
28213
28214bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
28215 SDValue &Offset,
28216 ISD::MemIndexedMode &AM,
28217 SelectionDAG &DAG) const {
28218 EVT VT;
28219 SDValue Ptr;
28220 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28221 VT = LD->getMemoryVT();
28222 Ptr = LD->getBasePtr();
28223 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28224 VT = ST->getMemoryVT();
28225 Ptr = ST->getBasePtr();
28226 } else
28227 return false;
28228
28229 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28230 return false;
28231 AM = ISD::PRE_INC;
28232 return true;
28233}
28234
28235 bool AArch64TargetLowering::getPostIndexedAddressParts(
28236 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
28237 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28238 EVT VT;
28239 SDValue Ptr;
28240 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28241 VT = LD->getMemoryVT();
28242 Ptr = LD->getBasePtr();
28243 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28244 VT = ST->getMemoryVT();
28245 Ptr = ST->getBasePtr();
28246 } else
28247 return false;
28248
28249 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28250 return false;
28251 // Post-indexing updates the base, so it's not a valid transform
28252 // if that's not the same as the load's pointer.
28253 if (Ptr != Base)
28254 return false;
28255 AM = ISD::POST_INC;
28256 return true;
28257}
28258 
28259 static void replaceBoolVectorBitcast(SDNode *N,
28260 SmallVectorImpl<SDValue> &Results,
28261 SelectionDAG &DAG) {
28262 SDLoc DL(N);
28263 SDValue Op = N->getOperand(0);
28264 EVT VT = N->getValueType(0);
28265 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28266 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28267 "Must be bool vector.");
28268
28269 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28270 // elements, it adds a vector concatenation with undef(s). If we encounter
28271 // this here, we can skip the concat.
28272 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28273 bool AllUndef = true;
28274 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
28275 AllUndef &= Op.getOperand(I).isUndef();
28276
28277 if (AllUndef)
28278 Op = Op.getOperand(0);
28279 }
28280
28281 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
28282 if (VectorBits)
28283 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
28284}
28285 
28286 static void CustomNonLegalBITCASTResults(SDNode *N,
28287 SmallVectorImpl<SDValue> &Results,
28288 SelectionDAG &DAG, EVT ExtendVT,
28289 EVT CastVT) {
28290 SDLoc DL(N);
28291 SDValue Op = N->getOperand(0);
28292 EVT VT = N->getValueType(0);
28293
28294 // Use SCALAR_TO_VECTOR for lane zero
28295 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
28296 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
28297 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
28298 Results.push_back(
28299 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
28300}
28301
28302 void AArch64TargetLowering::ReplaceBITCASTResults(
28303 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28304 SDLoc DL(N);
28305 SDValue Op = N->getOperand(0);
28306 EVT VT = N->getValueType(0);
28307 EVT SrcVT = Op.getValueType();
28308
28309 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
28310 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
28311 return;
28312 }
28313
28314 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
28315 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
28316 return;
28317 }
28318
28319 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
28320 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
28321 return;
28322 }
28323
28324 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
28325 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
28326 "Expected fp->int bitcast!");
28327
28328 // Bitcasting between unpacked vector types of different element counts is
28329 // not a NOP because the live elements are laid out differently.
28330 // 01234567
28331 // e.g. nxv2i32 = XX??XX??
28332 // nxv4f16 = X?X?X?X?
28333 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
28334 return;
28335
28336 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
28337 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
28338 return;
28339 }
28340
28341 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28342 !VT.isVector())
28343 return replaceBoolVectorBitcast(N, Results, DAG);
28344
28345 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
28346 return;
28347
28348 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
28349 DAG.getUNDEF(MVT::i32), Op);
28350 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
28351 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
28352}
28353 
28354 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
28355 SelectionDAG &DAG,
28356 const AArch64Subtarget *Subtarget) {
28357 EVT VT = N->getValueType(0);
28358 if (!VT.is256BitVector() ||
28359 (VT.getScalarType().isFloatingPoint() &&
28360 !N->getFlags().hasAllowReassociation()) ||
28361 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
28362 VT.getScalarType() == MVT::bf16)
28363 return;
28364
28365 SDValue X = N->getOperand(0);
28366 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
28367 if (!Shuf) {
28368 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
28369 X = N->getOperand(1);
28370 if (!Shuf)
28371 return;
28372 }
28373
28374 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
28375 return;
28376
28377 // Check the mask is 1,0,3,2,5,4,...
28378 ArrayRef<int> Mask = Shuf->getMask();
28379 for (int I = 0, E = Mask.size(); I < E; I++)
28380 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
28381 return;
28382
28383 SDLoc DL(N);
28384 auto LoHi = DAG.SplitVector(X, DL);
28385 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
28386 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
28387 LoHi.first, LoHi.second);
28388
28389 // Shuffle the elements back into order.
28390 SmallVector<int> NMask;
28391 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
28392 NMask.push_back(I);
28393 NMask.push_back(I);
28394 }
28395 Results.push_back(
28396 DAG.getVectorShuffle(VT, DL,
28397 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
28398 DAG.getUNDEF(LoHi.first.getValueType())),
28399 DAG.getUNDEF(VT), NMask));
28400}
28401 
28402 static void ReplaceReductionResults(SDNode *N,
28403 SmallVectorImpl<SDValue> &Results,
28404 SelectionDAG &DAG, unsigned InterOp,
28405 unsigned AcrossOp) {
28406 EVT LoVT, HiVT;
28407 SDValue Lo, Hi;
28408 SDLoc DL(N);
28409 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
28410 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28411 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
28412 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
28413 Results.push_back(SplitVal);
28414}
28415
28416 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
28417 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28418 SDValue In = N->getOperand(0);
28419 EVT InVT = In.getValueType();
28420
28421 // Common code will handle these just fine.
28422 if (!InVT.isScalableVector() || !InVT.isInteger())
28423 return;
28424
28425 SDLoc DL(N);
28426 EVT VT = N->getValueType(0);
28427
28428 // The following checks bail if this is not a halving operation.
28429
28430 ElementCount ResEC = VT.getVectorElementCount();
28431
28432 if (InVT.getVectorElementCount() != (ResEC * 2))
28433 return;
28434
28435 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
28436 if (!CIndex)
28437 return;
28438
28439 unsigned Index = CIndex->getZExtValue();
28440 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
28441 return;
28442
28443 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
28444 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28445
28446 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
28447 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
28448}
28449
28450 void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
28451 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28452 assert((Subtarget->hasSVE2p1() ||
28453 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
28454 "Custom lower of get.active.lane.mask missing required feature.");
28455
28456 assert(N->getValueType(0) == MVT::nxv32i1 &&
28457 "Unexpected result type for get.active.lane.mask");
28458
28459 SDLoc DL(N);
28460 SDValue Idx = N->getOperand(0);
28461 SDValue TC = N->getOperand(1);
28462
28463 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
28464 "Unexpected operand type for get.active.lane.mask");
28465
28466 if (Idx.getValueType() != MVT::i64) {
28467 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
28468 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
28469 }
28470
28471 SDValue ID =
28472 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
28473 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
28474 auto WideMask =
28475 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
28476
28477 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
28478 {WideMask.getValue(0), WideMask.getValue(1)}));
28479}
28480
28481 // Create an even/odd pair of X registers holding integer value V.
28482 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
28483 SDLoc DL(V.getNode());
28484 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
28485 if (DAG.getDataLayout().isBigEndian())
28486 std::swap (VLo, VHi);
28487 SDValue RegClass =
28488 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
28489 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
28490 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
28491 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
28492 return SDValue(
28493 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
28494}
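// Illustrative sketch (editorial, not part of the original source): for a
// little-endian target, an i128 value 0x00112233445566778899AABBCCDDEEFF is
// split into VLo = 0x8899AABBCCDDEEFF and VHi = 0x0011223344556677, and the
// REG_SEQUENCE above presents them to CASP-style instructions as one
// XSeqPairs operand, i.e. a single even/odd register pair such as x0/x1.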
28495 
28496 static void ReplaceCMP_SWAP_128Results(SDNode *N,
28497 SmallVectorImpl<SDValue> &Results,
28498 SelectionDAG &DAG,
28499 const AArch64Subtarget *Subtarget) {
28500 assert(N->getValueType(0) == MVT::i128 &&
28501 "AtomicCmpSwap on types less than 128 should be legal");
28502
28503 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28504 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
28505 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
28506 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
28507 SDValue Ops[] = {
28508 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
28509 createGPRPairNode(DAG, N->getOperand(3)), // Store value
28510 N->getOperand(1), // Ptr
28511 N->getOperand(0), // Chain in
28512 };
28513
28514 unsigned Opcode;
28515 switch (MemOp->getMergedOrdering()) {
28516 case AtomicOrdering::Monotonic:
28517 Opcode = AArch64::CASPX;
28518 break;
28519 case AtomicOrdering::Acquire:
28520 Opcode = AArch64::CASPAX;
28521 break;
28522 case AtomicOrdering::Release:
28523 Opcode = AArch64::CASPLX;
28524 break;
28525 case AtomicOrdering::AcquireRelease:
28526 case AtomicOrdering::SequentiallyConsistent:
28527 Opcode = AArch64::CASPALX;
28528 break;
28529 default:
28530 llvm_unreachable("Unexpected ordering!");
28531 }
28532
28533 MachineSDNode *CmpSwap = DAG.getMachineNode(
28534 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
28535 DAG.setNodeMemRefs(CmpSwap, {MemOp});
28536
28537 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
28538 if (DAG.getDataLayout().isBigEndian())
28539 std::swap(SubReg1, SubReg2);
28540 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
28541 SDValue(CmpSwap, 0));
28542 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
28543 SDValue(CmpSwap, 0));
28544 Results.push_back(
28545 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28546 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
28547 return;
28548 }
28549
28550 unsigned Opcode;
28551 switch (MemOp->getMergedOrdering()) {
28552 case AtomicOrdering::Monotonic:
28553 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
28554 break;
28555 case AtomicOrdering::Acquire:
28556 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
28557 break;
28558 case AtomicOrdering::Release:
28559 Opcode = AArch64::CMP_SWAP_128_RELEASE;
28560 break;
28561 case AtomicOrdering::AcquireRelease:
28562 case AtomicOrdering::SequentiallyConsistent:
28563 Opcode = AArch64::CMP_SWAP_128;
28564 break;
28565 default:
28566 llvm_unreachable("Unexpected ordering!");
28567 }
28568
28569 SDLoc DL(N);
28570 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
28571 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
28572 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
28573 New.first, New.second, N->getOperand(0)};
28574 SDNode *CmpSwap = DAG.getMachineNode(
28575 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
28576 Ops);
28577 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
28578
28579 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28580 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
28581 Results.push_back(SDValue(CmpSwap, 3));
28582}
28583
28584static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
28585 AtomicOrdering Ordering) {
28586 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
28587 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
28588 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
28589 // ATOMIC_LOAD_CLR at any point.
28590 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
28591 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
28592 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
28593 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
28594
28595 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28596 // The operand will need to be XORed in a separate step.
28597 switch (Ordering) {
28598 case AtomicOrdering::Monotonic:
28599 return AArch64::LDCLRP;
28600 break;
28601 case AtomicOrdering::Acquire:
28602 return AArch64::LDCLRPA;
28603 break;
28604 case AtomicOrdering::Release:
28605 return AArch64::LDCLRPL;
28606 break;
28607 case AtomicOrdering::AcquireRelease:
28608 case AtomicOrdering::SequentiallyConsistent:
28609 return AArch64::LDCLRPAL;
28610 break;
28611 default:
28612 llvm_unreachable("Unexpected ordering!");
28613 }
28614 }
28615
28616 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
28617 switch (Ordering) {
28618 case AtomicOrdering::Monotonic:
28619 return AArch64::LDSETP;
28620 break;
28621 case AtomicOrdering::Acquire:
28622 return AArch64::LDSETPA;
28623 break;
28624 case AtomicOrdering::Release:
28625 return AArch64::LDSETPL;
28626 break;
28627 case AtomicOrdering::AcquireRelease:
28628 case AtomicOrdering::SequentiallyConsistent:
28629 return AArch64::LDSETPAL;
28630 break;
28631 default:
28632 llvm_unreachable("Unexpected ordering!");
28633 }
28634 }
28635
28636 if (ISDOpcode == ISD::ATOMIC_SWAP) {
28637 switch (Ordering) {
28638 case AtomicOrdering::Monotonic:
28639 return AArch64::SWPP;
28640 break;
28641 case AtomicOrdering::Acquire:
28642 return AArch64::SWPPA;
28643 break;
28644 case AtomicOrdering::Release:
28645 return AArch64::SWPPL;
28646 break;
28647 case AtomicOrdering::AcquireRelease:
28648 case AtomicOrdering::SequentiallyConsistent:
28649 return AArch64::SWPPAL;
28650 break;
28651 default:
28652 llvm_unreachable("Unexpected ordering!");
28653 }
28654 }
28655
28656 llvm_unreachable("Unexpected ISDOpcode!");
28657}
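// Illustrative mapping (editorial summary of the switches above):
//   (ATOMIC_LOAD_OR,  Acquire)                -> LDSETPA
//   (ATOMIC_LOAD_AND, Release)                -> LDCLRPL  (operand pre-inverted)
//   (ATOMIC_SWAP,     SequentiallyConsistent) -> SWPPAL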
28658 
28659 static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
28660 SmallVectorImpl<SDValue> &Results,
28661 SelectionDAG &DAG,
28662 const AArch64Subtarget *Subtarget) {
28663 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
28664 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28665 // rather than the CASP instructions, because CASP has register classes for
28666 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28667 // to present them as single operands. LSE128 instructions use the GPR64
28668 // register class (because the pair does not have to be sequential), like
28669 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28670
28671 assert(N->getValueType(0) == MVT::i128 &&
28672 "AtomicLoadXXX on types less than 128 should be legal");
28673
28674 if (!Subtarget->hasLSE128())
28675 return;
28676
28677 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28678 const SDValue &Chain = N->getOperand(0);
28679 const SDValue &Ptr = N->getOperand(1);
28680 const SDValue &Val128 = N->getOperand(2);
28681 std::pair<SDValue, SDValue> Val2x64 =
28682 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28683
28684 const unsigned ISDOpcode = N->getOpcode();
28685 const unsigned MachineOpcode =
28686 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28687
28688 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28689 SDLoc DL(Val128);
28690 Val2x64.first =
28691 DAG.getNode(ISD::XOR, DL, MVT::i64,
28692 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28693 Val2x64.second =
28694 DAG.getNode(ISD::XOR, DL, MVT::i64,
28695 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28696 }
28697
28698 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28699 if (DAG.getDataLayout().isBigEndian())
28700 std::swap(Ops[0], Ops[1]);
28701
28702 MachineSDNode *AtomicInst =
28703 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28704 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28705
28706 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28707
28708 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28709 if (DAG.getDataLayout().isBigEndian())
28710 std::swap(Lo, Hi);
28711
28712 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28713 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28714}
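// Illustrative sketch (editorial, not part of the original source): a 128-bit
//   %old = atomicrmw and ptr %p, i128 %m acquire
// takes the XOR-with-all-ones path above and becomes LDCLRPA, since LDCLRP
// computes *p & ~operand, and clearing with ~%m is the same as ANDing with %m.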
28715
28716 void AArch64TargetLowering::ReplaceNodeResults(
28717 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28718 switch (N->getOpcode()) {
28719 default:
28720 llvm_unreachable("Don't know how to custom expand this");
28721 case ISD::BITCAST:
28722 ReplaceBITCASTResults(N, Results, DAG);
28723 return;
28724 case ISD::VECREDUCE_ADD:
28725 case ISD::VECREDUCE_SMAX:
28726 case ISD::VECREDUCE_SMIN:
28727 case ISD::VECREDUCE_UMAX:
28728 case ISD::VECREDUCE_UMIN:
28729 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28730 return;
28731 case ISD::VECTOR_COMPRESS:
28732 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28733 Results.push_back(Res);
28734 return;
28735 case ISD::ADD:
28736 case ISD::FADD:
28737 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28738 return;
28739
28740 case ISD::CTPOP:
28741 case ISD::PARITY:
28742 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28743 Results.push_back(Result);
28744 return;
28745 case AArch64ISD::SADDV:
28746 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28747 return;
28748 case AArch64ISD::UADDV:
28749 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28750 return;
28751 case AArch64ISD::SMINV:
28752 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28753 return;
28754 case AArch64ISD::UMINV:
28755 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28756 return;
28757 case AArch64ISD::SMAXV:
28758 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28759 return;
28760 case AArch64ISD::UMAXV:
28761 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28762 return;
28763 case ISD::MULHS:
28765 Results.push_back(
28766 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28767 return;
28768 case ISD::MULHU:
28770 Results.push_back(
28771 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28772 return;
28773 case ISD::FP_TO_UINT:
28774 case ISD::FP_TO_SINT:
28775 case ISD::FP_TO_SINT_SAT:
28776 case ISD::FP_TO_UINT_SAT:
28777 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28778 // Let normal code take care of it by not adding anything to Results.
28779 return;
28780 case ISD::ATOMIC_CMP_SWAP:
28781 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28782 return;
28783 case ISD::ATOMIC_LOAD_CLR:
28784 assert(N->getValueType(0) != MVT::i128 &&
28785 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28786 break;
28787 case ISD::ATOMIC_LOAD_AND:
28788 case ISD::ATOMIC_LOAD_OR:
28789 case ISD::ATOMIC_SWAP: {
28790 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28791 "Expected 128-bit atomicrmw.");
28792 // These need custom type legalisation so we go directly to instruction.
28793 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28794 return;
28795 }
28796 case ISD::ADDRSPACECAST: {
28797 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28798 Results.push_back(V);
28799 return;
28800 }
28801 case ISD::ATOMIC_LOAD:
28802 case ISD::LOAD: {
28803 MemSDNode *LoadNode = cast<MemSDNode>(N);
28804 EVT MemVT = LoadNode->getMemoryVT();
28805 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
28806 // targets.
28807 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28808 MemVT.getSizeInBits() == 256u &&
28809 (MemVT.getScalarSizeInBits() == 8u ||
28810 MemVT.getScalarSizeInBits() == 16u ||
28811 MemVT.getScalarSizeInBits() == 32u ||
28812 MemVT.getScalarSizeInBits() == 64u)) {
28813
28814 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28815 SDValue Result = DAG.getMemIntrinsicNode(
28816 AArch64ISD::LDNP, SDLoc(N),
28817 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28818 {LoadNode->getChain(), LoadNode->getBasePtr()},
28819 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28820
28821 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28822 DAG.getBitcast(HalfVT, Result.getValue(0)),
28823 DAG.getBitcast(HalfVT, Result.getValue(1)));
28824 Results.append({Pair, Result.getValue(2) /* Chain */});
28825 return;
28826 }
28827
28828 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28829 LoadNode->getMemoryVT() != MVT::i128) {
28830 // Non-volatile, non-atomic loads are optimized later in AArch64's
28831 // load/store optimizer.
28832 return;
28833 }
28834
28835 if (SDValue(N, 0).getValueType() == MVT::i128) {
28836 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28837 bool isLoadAcquire =
28839 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28840
28841 if (isLoadAcquire)
28842 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28843 
28844 SDValue Result = DAG.getMemIntrinsicNode(
28845 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28846 {LoadNode->getChain(), LoadNode->getBasePtr()},
28847 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28848
28849 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28850
28851 SDValue Pair =
28852 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28853 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28854 Results.append({Pair, Result.getValue(2) /* Chain */});
28855 }
28856 return;
28857 }
28858 case ISD::EXTRACT_SUBVECTOR:
28859 ReplaceExtractSubVectorResults(N, Results, DAG);
28860 return;
28861 case ISD::INSERT_SUBVECTOR:
28862 case ISD::CONCAT_VECTORS:
28863 // Custom lowering has been requested for INSERT_SUBVECTOR and
28864 // CONCAT_VECTORS -- but delegate to common code for result type
28865 // legalisation
28866 return;
28867 case ISD::GET_ACTIVE_LANE_MASK:
28868 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28869 return;
28870 case ISD::INTRINSIC_WO_CHAIN: {
28871 EVT VT = N->getValueType(0);
28872
28873 Intrinsic::ID IntID =
28874 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28875 switch (IntID) {
28876 default:
28877 return;
28878 case Intrinsic::aarch64_sve_clasta_n: {
28879 assert((VT == MVT::i8 || VT == MVT::i16) &&
28880 "custom lowering for unexpected type");
28881 SDLoc DL(N);
28882 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28883 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28884 N->getOperand(1), Op2, N->getOperand(3));
28885 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28886 return;
28887 }
28888 case Intrinsic::aarch64_sve_clastb_n: {
28889 assert((VT == MVT::i8 || VT == MVT::i16) &&
28890 "custom lowering for unexpected type");
28891 SDLoc DL(N);
28892 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28893 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28894 N->getOperand(1), Op2, N->getOperand(3));
28895 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28896 return;
28897 }
28898 case Intrinsic::aarch64_sve_lasta: {
28899 assert((VT == MVT::i8 || VT == MVT::i16) &&
28900 "custom lowering for unexpected type");
28901 SDLoc DL(N);
28902 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28903 N->getOperand(1), N->getOperand(2));
28904 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28905 return;
28906 }
28907 case Intrinsic::aarch64_sve_lastb: {
28908 assert((VT == MVT::i8 || VT == MVT::i16) &&
28909 "custom lowering for unexpected type");
28910 SDLoc DL(N);
28911 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28912 N->getOperand(1), N->getOperand(2));
28913 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28914 return;
28915 }
28916 case Intrinsic::aarch64_sme_in_streaming_mode: {
28917 SDLoc DL(N);
28918 SDValue Chain = DAG.getEntryNode();
28919
28920 SDValue RuntimePStateSM =
28921 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28922 Results.push_back(
28923 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28924 return;
28925 }
28926 case Intrinsic::experimental_vector_match: {
28927 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28928 return;
28929
28930 // NOTE: Only trivial type promotion is supported.
28931 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28932 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28933 return;
28934
28935 SDLoc DL(N);
28936 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28937 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28938 return;
28939 }
28940 }
28941 }
28942 case ISD::READ_REGISTER: {
28943 SDLoc DL(N);
28944 assert(N->getValueType(0) == MVT::i128 &&
28945 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28946 SDValue Chain = N->getOperand(0);
28947 SDValue SysRegName = N->getOperand(1);
28948
28949 SDValue Result = DAG.getNode(
28950 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28951 Chain, SysRegName);
28952
28953 // Sysregs are not endian. Result.getValue(0) always contains the lower half
28954 // of the 128-bit System Register value.
28955 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28956 Result.getValue(0), Result.getValue(1));
28957 Results.push_back(Pair);
28958 Results.push_back(Result.getValue(2)); // Chain
28959 return;
28960 }
28961 }
28962}
28963 
28964 bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
28965 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28966 return false;
28967 return true;
28968 }
28969 
28970 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
28971 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28972 // reciprocal if there are three or more FDIVs.
28973 return 3;
28974}
28975 
28976 TargetLoweringBase::LegalizeTypeAction
28977 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
28978 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
28979 // v4i16, v2i32 instead of to promote.
28980 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28981 VT == MVT::v1f32)
28982 return TypeWidenVector;
28983 
28984 return TargetLoweringBase::getPreferredVectorAction(VT);
28985 }
28986
28987// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28988 // provided the address is 16-byte aligned.
28989 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
28990 if (!Subtarget->hasLSE2())
28991 return false;
28992
28993 if (auto LI = dyn_cast<LoadInst>(I))
28994 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28995 LI->getAlign() >= Align(16);
28996
28997 if (auto SI = dyn_cast<StoreInst>(I))
28998 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28999 SI->getAlign() >= Align(16);
29000
29001 return false;
29002}
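// Illustrative examples (editorial, not part of the original source),
// assuming an LSE2 subtarget:
//   load atomic i128, ptr %p monotonic, align 16     ; suitable (single LDP)
//   store atomic i128 %v, ptr %p monotonic, align 8  ; not suitable (align < 16)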
29003 
29004 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
29005 if (!Subtarget->hasLSE128())
29006 return false;
29007
29008 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
29009 // will clobber the two registers.
29010 if (const auto *SI = dyn_cast<StoreInst>(I))
29011 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29012 SI->getAlign() >= Align(16) &&
29013 (SI->getOrdering() == AtomicOrdering::Release ||
29014 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
29015
29016 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
29017 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29018 RMW->getAlign() >= Align(16) &&
29019 (RMW->getOperation() == AtomicRMWInst::Xchg ||
29020 RMW->getOperation() == AtomicRMWInst::And ||
29021 RMW->getOperation() == AtomicRMWInst::Or);
29022
29023 return false;
29024}
29025 
29026 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
29027 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
29028 return false;
29029
29030 if (auto LI = dyn_cast<LoadInst>(I))
29031 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29032 LI->getAlign() >= Align(16) &&
29033 LI->getOrdering() == AtomicOrdering::Acquire;
29034
29035 if (auto SI = dyn_cast<StoreInst>(I))
29036 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29037 SI->getAlign() >= Align(16) &&
29038 SI->getOrdering() == AtomicOrdering::Release;
29039
29040 return false;
29041}
29042 
29043 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
29044 const Instruction *I) const {
29045 if (isOpSuitableForRCPC3(I))
29046 return false;
29047 if (isOpSuitableForLSE128(I))
29048 return false;
29049 if (isOpSuitableForLDPSTP(I))
29050 return true;
29051 return false;
29052}
29053 
29054 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
29055 const Instruction *I) const {
29056 // Store-Release instructions only provide seq_cst guarantees when paired with
29057 // Load-Acquire instructions. MSVC CRT does not use these instructions to
29058 // implement seq_cst loads and stores, so we need additional explicit fences
29059 // after memory writes.
29060 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
29061 return false;
29062
29063 switch (I->getOpcode()) {
29064 default:
29065 return false;
29066 case Instruction::AtomicCmpXchg:
29067 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
29068 AtomicOrdering::SequentiallyConsistent;
29069 case Instruction::AtomicRMW:
29070 return cast<AtomicRMWInst>(I)->getOrdering() ==
29071 AtomicOrdering::SequentiallyConsistent;
29072 case Instruction::Store:
29073 return cast<StoreInst>(I)->getOrdering() ==
29074 AtomicOrdering::SequentiallyConsistent;
29075 }
29076}
29077
29078// Loads and stores less than 128-bits are already atomic; ones above that
29079// are doomed anyway, so defer to the default libcall and blame the OS when
29080 // things go wrong.
29081 TargetLowering::AtomicExpansionKind
29082 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29083 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
29084 if (Size != 128)
29085 return AtomicExpansionKind::None;
29086 if (isOpSuitableForRCPC3(SI))
29087 return AtomicExpansionKind::None;
29088 if (isOpSuitableForLSE128(SI))
29089 return AtomicExpansionKind::Expand;
29090 if (isOpSuitableForLDPSTP(SI))
29091 return AtomicExpansionKind::None;
29092 return AtomicExpansionKind::Expand;
29093 }
29094
29095// Loads and stores less than 128-bits are already atomic; ones above that
29096// are doomed anyway, so defer to the default libcall and blame the OS when
29097 // things go wrong.
29098 TargetLowering::AtomicExpansionKind
29099 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29100 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
29101 
29102 if (Size != 128)
29103 return AtomicExpansionKind::None;
29104 if (isOpSuitableForRCPC3(LI))
29105 return AtomicExpansionKind::None;
29106 // No LSE128 loads
29107 if (isOpSuitableForLDPSTP(LI))
29108 return AtomicExpansionKind::None;
29109 
29110 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29111 // implement atomicrmw without spilling. If the target address is also on the
29112 // stack and close enough to the spill slot, this can lead to a situation
29113 // where the monitor always gets cleared and the atomic operation can never
29114 // succeed. So at -O0 lower this operation to a CAS loop.
29115 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29116 return AtomicExpansionKind::CmpXChg;
29117 
29118 // Using CAS for an atomic load has a better chance of succeeding under high
29119 // contention situations. So use it if available.
29120 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
29121 : AtomicExpansionKind::LLSC;
29122 }
29123
29124// Return true if the atomic operation expansion will lower to use a library
29125// call, and is thus ineligible to use an LLSC expansion.
29126static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
29127 const AtomicRMWInst *RMW) {
29128 if (!RMW->isFloatingPointOperation())
29129 return false;
29130 switch (RMW->getType()->getScalarType()->getTypeID()) {
29131 case Type::FloatTyID:
29132 case Type::DoubleTyID:
29133 case Type::HalfTyID:
29134 case Type::BFloatTyID:
29135 // Will use soft float
29136 return !Subtarget.hasFPARMv8();
29137 default:
29138 // fp128 will emit library calls.
29139 return true;
29140 }
29141
29142 llvm_unreachable("covered type switch");
29143}
29144
29145// The "default" for integer RMW operations is to expand to an LL/SC loop.
29146// However, with the LSE instructions (or outline-atomics mode, which provides
29147// library routines in place of the LSE-instructions), we can directly emit many
29148 // operations instead.
29149 TargetLowering::AtomicExpansionKind
29150 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29151 Type *Ty = AI->getType();
29152 unsigned Size = Ty->getPrimitiveSizeInBits();
29153 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
29154
29155 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
29156 (AI->getOperation() == AtomicRMWInst::Xchg ||
29157 AI->getOperation() == AtomicRMWInst::Or ||
29158 AI->getOperation() == AtomicRMWInst::And);
29159 if (CanUseLSE128)
29160 return AtomicExpansionKind::None;
29161 
29162 // If LSFE available, use atomic FP instructions in preference to expansion
29163 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
29164 AI->getOperation() == AtomicRMWInst::FMax ||
29165 AI->getOperation() == AtomicRMWInst::FMin ||
29166 AI->getOperation() == AtomicRMWInst::FMaximum ||
29167 AI->getOperation() == AtomicRMWInst::FMinimum))
29168 return AtomicExpansionKind::None;
29169 
29170 // Nand is not supported in LSE.
29171 // Leave 128 bits to LLSC or CmpXChg.
29172 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
29173 !AI->isFloatingPointOperation()) {
29174 if (Subtarget->hasLSE())
29175 return AtomicExpansionKind::None;
29176 if (Subtarget->outlineAtomics()) {
29177 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
29178 // Don't outline them unless
29179 // (1) high level <atomic> support approved:
29180 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
29181 // (2) low level libgcc and compiler-rt support implemented by:
29182 // min/max outline atomics helpers
29183 if (AI->getOperation() != AtomicRMWInst::Min &&
29184 AI->getOperation() != AtomicRMWInst::Max &&
29185 AI->getOperation() != AtomicRMWInst::UMin &&
29186 AI->getOperation() != AtomicRMWInst::UMax) {
29187 return AtomicExpansionKind::None;
29188 }
29189 }
29190 }
29191
29192 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29193 // implement atomicrmw without spilling. If the target address is also on the
29194 // stack and close enough to the spill slot, this can lead to a situation
29195 // where the monitor always gets cleared and the atomic operation can never
29196 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
29197 // we have a single CAS instruction that can replace the loop.
29198 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
29199 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
29200 return AtomicExpansionKind::CmpXChg;
29201 
29202 return AtomicExpansionKind::LLSC;
29203 }
29204 
29205 TargetLowering::AtomicExpansionKind
29206 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
29207 AtomicCmpXchgInst *AI) const {
29208 // If subtarget has LSE, leave cmpxchg intact for codegen.
29209 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
29210 return AtomicExpansionKind::None;
29211 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29212 // implement cmpxchg without spilling. If the address being exchanged is also
29213 // on the stack and close enough to the spill slot, this can lead to a
29214 // situation where the monitor always gets cleared and the atomic operation
29215 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
29216 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29217 return AtomicExpansionKind::None;
29218 
29219 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
29220 // it.
29221 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
29222 if (Size > 64)
29223 return AtomicExpansionKind::None;
29224 
29225 return AtomicExpansionKind::LLSC;
29226 }
29227 
29228 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
29229 Type *ValueTy, Value *Addr,
29230 AtomicOrdering Ord) const {
29231 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29232 bool IsAcquire = isAcquireOrStronger(Ord);
29233
29234 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
29235 // intrinsic must return {i64, i64} and we have to recombine them into a
29236 // single i128 here.
29237 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29238 Intrinsic::ID Int =
29239 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29240
29241 Value *LoHi =
29242 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29243
29244 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29245 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29246
29247 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29248 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29249 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29250
29251 Value *Or = Builder.CreateOr(
29252 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
29253 return Builder.CreateBitCast(Or, ValueTy);
29254 }
29255
29256 Type *Tys[] = { Addr->getType() };
29257 Intrinsic::ID Int =
29258 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
29259
29260 const DataLayout &DL = M->getDataLayout();
29261 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
29262 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
29263 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
29264 Attribute::ElementType, IntEltTy));
29265 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
29266
29267 return Builder.CreateBitCast(Trunc, ValueTy);
29268}
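// Illustrative IR sketch (editorial, not part of the original source) of what
// the 128-bit acquire path above emits:
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
// followed by zexts to i128, a shift of %hi by 64 and an OR to rebuild i128.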
29269 
29270 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
29271 IRBuilderBase &Builder) const {
29272 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
29273}
29274 
29275 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
29276 Value *Val, Value *Addr,
29277 AtomicOrdering Ord) const {
29278 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29279 bool IsRelease = isReleaseOrStronger(Ord);
29280
29281 // Since the intrinsics must have legal type, the i128 intrinsics take two
29282 // parameters: "i64, i64". We must marshal Val into the appropriate form
29283 // before the call.
29284 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
29285 Intrinsic::ID Int =
29286 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
29287 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
29288 Type *Int64Ty = Type::getInt64Ty(M->getContext());
29289 Type *Int128Ty = Type::getInt128Ty(M->getContext());
29290
29291 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
29292
29293 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
29294 Value *Hi =
29295 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
29296 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
29297 }
29298 
29299 Intrinsic::ID Int =
29300 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
29301 Type *Tys[] = { Addr->getType() };
29302 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
29303 
29304 const DataLayout &DL = M->getDataLayout();
29305 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
29306 Val = Builder.CreateBitCast(Val, IntValTy);
29307
29308 CallInst *CI = Builder.CreateCall(
29309 Stxr, {Builder.CreateZExtOrBitCast(
29310 Val, Stxr->getFunctionType()->getParamType(0)),
29311 Addr});
29312 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
29313 Attribute::ElementType, Val->getType()));
29314 return CI;
29315}
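// Illustrative IR sketch (editorial, not part of the original source) of the
// 128-bit release path above:
//   %lo   = trunc i128 %val to i64
//   %shr  = lshr i128 %val, 64
//   %hi   = trunc i128 %shr to i64
//   %stat = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where %stat is 0 if the store-exclusive succeeded and 1 otherwise.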
29316 
29317 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
29318 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
29319 const DataLayout &DL) const {
29320 if (!Ty->isArrayTy()) {
29321 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
29322 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
29323 }
29324
29325 // All non aggregate members of the type must have the same type
29326 SmallVector<EVT> ValueVTs;
29327 ComputeValueVTs(*this, DL, Ty, ValueVTs);
29328 return all_equal(ValueVTs);
29329}
29330
29331bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
29332 EVT) const {
29333 return false;
29334}
29335
29336static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
29337 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
29338 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
29339 M, Intrinsic::thread_pointer, IRB.getPtrTy());
29340 return IRB.CreatePointerCast(
29341 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
29342 Offset),
29343 IRB.getPtrTy(0));
29344}
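// Illustrative IR sketch (editorial, not part of the original source):
// UseTlsOffset(IRB, 0x28) produces roughly
//   %tp  = call ptr @llvm.thread_pointer()
//   %gep = getelementptr i8, ptr %tp, i32 40
// i.e. an address formed from the thread pointer (TPIDR_EL0) plus the fixed
// platform-defined slot offset.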
29345 
29346 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
29347 // Android provides a fixed TLS slot for the stack cookie. See the definition
29348 // of TLS_SLOT_STACK_GUARD in
29349 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
29350 if (Subtarget->isTargetAndroid())
29351 return UseTlsOffset(IRB, 0x28);
29352
29353 // Fuchsia is similar.
29354 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
29355 if (Subtarget->isTargetFuchsia())
29356 return UseTlsOffset(IRB, -0x10);
29357 
29358 return TargetLowering::getIRStackGuard(IRB);
29359 }
29360 
29361 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
29362 // MSVC CRT provides functionalities for stack protection.
29363 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29364 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29365
29366 RTLIB::LibcallImpl SecurityCookieVar =
29367 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
29368 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
29369 SecurityCookieVar != RTLIB::Unsupported) {
29370 // MSVC CRT has a global variable holding security cookie.
29371 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
29372 PointerType::getUnqual(M.getContext()));
29373
29374 // MSVC CRT has a function to validate security cookie.
29375 FunctionCallee SecurityCheckCookie =
29376 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
29377 Type::getVoidTy(M.getContext()),
29378 PointerType::getUnqual(M.getContext()));
29379 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
29380 F->setCallingConv(CallingConv::Win64);
29381 F->addParamAttr(0, Attribute::AttrKind::InReg);
29382 }
29383 return;
29384 }
29385 TargetLowering::insertSSPDeclarations(M);
29386}
29387
29388Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
29389 // MSVC CRT has a function to validate security cookie.
29390 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29391 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29392 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
29393 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
29394 return TargetLowering::getSSPStackGuardCheck(M);
29395}
29396
29397Value *
29398AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
29399 // Android provides a fixed TLS slot for the SafeStack pointer. See the
29400 // definition of TLS_SLOT_SAFESTACK in
29401 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
29402 if (Subtarget->isTargetAndroid())
29403 return UseTlsOffset(IRB, 0x48);
29404
29405 // Fuchsia is similar.
29406 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
29407 if (Subtarget->isTargetFuchsia())
29408 return UseTlsOffset(IRB, -0x8);
29409
29410 return TargetLowering::getSafeStackPointerLocation(IRB);
29411}
29412
29413/// If a physical register, this returns the register that receives the
29414/// exception address on entry to an EH pad.
29415Register AArch64TargetLowering::getExceptionPointerRegister(
29416 const Constant *PersonalityFn) const {
29417 // FIXME: This is a guess. Has this been defined yet?
29418 return AArch64::X0;
29419}
29420
29421/// If a physical register, this returns the register that receives the
29422/// exception typeid on entry to a landing pad.
29423Register AArch64TargetLowering::getExceptionSelectorRegister(
29424 const Constant *PersonalityFn) const {
29425 // FIXME: This is a guess. Has this been defined yet?
29426 return AArch64::X1;
29427}
29428
29429bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
29430 const Instruction &AndI) const {
29431 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
29432 // this is likely to fold the and/cmp/br into a single tbz instruction. It
29433 // may be beneficial to sink in other cases, but we would have to check that
29434 // the cmp would not get folded into the br to form a cbz for these to be
29435 // beneficial.
29436 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
29437 if (!Mask)
29438 return false;
29439 return Mask->getValue().isPowerOf2();
29440}
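// For instance (illustrative only): sinking "%a = and i64 %x, 8", a
// single-bit mask, next to its compare-and-branch user lets ISel emit a
// single "tbz x0, #3, <label>", whereas a multi-bit mask would still need a
// separate AND followed by CBZ, so sinking it is not clearly profitable.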
29441
29445 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
29446 SelectionDAG &DAG) const {
29447 // Does baseline recommend not to perform the fold by default?
29449 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
29450 return false;
29451 // Else, if this is a vector shift, prefer 'shl'.
29452 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
29453}
29454
29457 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
29459 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
29462 ExpansionFactor);
29463}
29464
29465void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
29466 // Update IsSplitCSR in AArch64FunctionInfo.
29467 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
29468 AFI->setIsSplitCSR(true);
29469}
29470
29471void AArch64TargetLowering::insertCopiesSplitCSR(
29472 MachineBasicBlock *Entry,
29473 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
29474 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
29475 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
29476 if (!IStart)
29477 return;
29478
29479 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
29480 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
29481 MachineBasicBlock::iterator MBBI = Entry->begin();
29482 for (const MCPhysReg *I = IStart; *I; ++I) {
29483 const TargetRegisterClass *RC = nullptr;
29484 if (AArch64::GPR64RegClass.contains(*I))
29485 RC = &AArch64::GPR64RegClass;
29486 else if (AArch64::FPR64RegClass.contains(*I))
29487 RC = &AArch64::FPR64RegClass;
29488 else
29489 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
29490
29491 Register NewVR = MRI->createVirtualRegister(RC);
29492 // Create copy from CSR to a virtual register.
29493 // FIXME: this currently does not emit CFI pseudo-instructions, it works
29494 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
29495 // nounwind. If we want to generalize this later, we may need to emit
29496 // CFI pseudo-instructions.
29497 assert(Entry->getParent()->getFunction().hasFnAttribute(
29498 Attribute::NoUnwind) &&
29499 "Function should be nounwind in insertCopiesSplitCSR!");
29500 Entry->addLiveIn(*I);
29501 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
29502 .addReg(*I);
29503
29504 // Insert the copy-back instructions right before the terminator.
29505 for (auto *Exit : Exits)
29506 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
29507 TII->get(TargetOpcode::COPY), *I)
29508 .addReg(NewVR);
29509 }
29510}
29511
29512bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
29513 // Integer division on AArch64 is expensive. However, when aggressively
29514 // optimizing for code size, we prefer to use a div instruction, as it is
29515 // usually smaller than the alternative sequence.
29516 // The exception to this is vector division. Since AArch64 doesn't have vector
29517 // integer division, leaving the division as-is is a loss even in terms of
29518 // size, because it will have to be scalarized, while the alternative code
29519 // sequence can be performed in vector form.
29520 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
29521 return OptSize && !VT.isVector();
29522}
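// Example (sketch): at minsize a scalar sdiv by a constant is kept as a
// single SDIV, while the default expansion into a multiply/shift sequence
// would be faster but larger. Vector divides still return false here, since
// keeping them would force scalarization, which is larger than the expanded
// vector-friendly sequence.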
29523
29524bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
29525 const MachineFunction &MF) const {
29526 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
29527 // In future, we could allow this when SVE is available, but currently,
29528 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
29529 // the general lowering may introduce stack spills/reloads).
29530 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
29531 return false;
29532
29533 // Do not merge to float value size (128 bits) if no implicit float attribute
29534 // is set.
29535 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
29536 return !NoFloat || MemVT.getSizeInBits() <= 64;
29537}
29538
29539bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
29540 // We want inc-of-add for scalars and sub-of-not for vectors.
29541 return VT.isScalarInteger();
29542}
29543
29544bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
29545 EVT VT) const {
29546 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
29547 // legalize.
29548 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
29549 return false;
29550 if (FPVT == MVT::v8bf16)
29551 return false;
29552 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
29553}
29554
29555bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
29556 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
29557 // avoid vselect becoming bsl / unrolling.
29558 return !VT.isFixedLengthVector();
29559}
29560
29561MachineInstr *
29562AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
29563 MachineBasicBlock::iterator &MBBI,
29564 const TargetInstrInfo *TII) const {
29565 assert(MBBI->isCall() && MBBI->getCFIType() &&
29566 "Invalid call instruction for a KCFI check");
29567
29568 switch (MBBI->getOpcode()) {
29569 case AArch64::BLR:
29570 case AArch64::BLRNoIP:
29571 case AArch64::TCRETURNri:
29572 case AArch64::TCRETURNrix16x17:
29573 case AArch64::TCRETURNrix17:
29574 case AArch64::TCRETURNrinotx16:
29575 break;
29576 default:
29577 llvm_unreachable("Unexpected CFI call opcode");
29578 }
29579
29580 MachineOperand &Target = MBBI->getOperand(0);
29581 assert(Target.isReg() && "Invalid target operand for an indirect call");
29582 Target.setIsRenamable(false);
29583
29584 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
29585 .addReg(Target.getReg())
29586 .addImm(MBBI->getCFIType())
29587 .getInstr();
29588}
29589
29590bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
29591 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
29592}
29593
29594unsigned
29595AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
29596 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
29597 return getPointerTy(DL).getSizeInBits();
29598
29599 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
29600}
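// The size above corresponds to the AAPCS64 va_list layout (sketch); Darwin
// and Windows use a plain pointer instead:
//   struct va_list {
//     void *__stack;   // pointer
//     void *__gr_top;  // pointer
//     void *__vr_top;  // pointer
//     int   __gr_offs; // 32 bits
//     int   __vr_offs; // 32 bits
//   };                 // = 3 * pointer-size + 2 * 32 bits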
29601
29602void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
29603 MachineFrameInfo &MFI = MF.getFrameInfo();
29604 // If we have any vulnerable SVE stack objects then the stack protector
29605 // needs to be placed at the top of the SVE stack area, as the SVE locals
29606 // are placed above the other locals, so we allocate it as if it were a
29607 // scalable vector.
29608 // FIXME: It may be worthwhile having a specific interface for this rather
29609 // than doing it here in finalizeLowering.
29610 if (MFI.hasStackProtectorIndex()) {
29611 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
29612 if (MFI.hasScalableStackID(i) &&
29617 break;
29618 }
29619 }
29620 }
29623}
29624
29625// Unlike X86, we let frame lowering assign offsets to all catch objects.
29626bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
29627
29628bool AArch64TargetLowering::shouldLocalize(
29629 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
29630 auto &MF = *MI.getMF();
29631 auto &MRI = MF.getRegInfo();
29632 auto maxUses = [](unsigned RematCost) {
29633 // A cost of 1 means remats are basically free.
29634 if (RematCost == 1)
29635 return std::numeric_limits<unsigned>::max();
29636 if (RematCost == 2)
29637 return 2U;
29638
29639 // Remat is too expensive, only sink if there's one user.
29640 if (RematCost > 2)
29641 return 1U;
29642 llvm_unreachable("Unexpected remat cost");
29643 };
29644
29645 unsigned Opc = MI.getOpcode();
29646 switch (Opc) {
29647 case TargetOpcode::G_GLOBAL_VALUE: {
29648 // On Darwin, TLS global vars get selected into function calls, which
29649 // we don't want localized, as they can get moved into the middle of
29650 // another call sequence.
29651 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29652 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29653 return false;
29654 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29655 }
29656 case TargetOpcode::G_FCONSTANT:
29657 case TargetOpcode::G_CONSTANT: {
29658 const ConstantInt *CI;
29659 unsigned AdditionalCost = 0;
29660
29661 if (Opc == TargetOpcode::G_CONSTANT)
29662 CI = MI.getOperand(1).getCImm();
29663 else {
29664 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29665 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29666 // materialized as integers.
29667 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29668 break;
29669 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29670 bool OptForSize = MF.getFunction().hasOptSize();
29671 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getSizeInBits()),
29672 OptForSize))
29673 return true; // Constant should be cheap.
29674 CI =
29675 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29676 // FP materialization also costs an extra move, from gpr to fpr.
29677 AdditionalCost = 1;
29678 }
29679 APInt Imm = CI->getValue();
29680 InstructionCost Cost = TTI->getIntImmCost(
29681 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
29682 assert(Cost.isValid() && "Expected a valid imm cost");
29683
29684 unsigned RematCost = Cost.getValue();
29685 RematCost += AdditionalCost;
29686 Register Reg = MI.getOperand(0).getReg();
29687 unsigned MaxUses = maxUses(RematCost);
29688 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29689 if (MaxUses == std::numeric_limits<unsigned>::max())
29690 --MaxUses;
29691 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29692 }
29693 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29694 // localizable.
29695 case AArch64::ADRP:
29696 case AArch64::G_ADD_LOW:
29697 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29698 case TargetOpcode::G_PTR_ADD:
29699 return true;
29700 default:
29701 break;
29702 }
29703 return TargetLoweringBase::shouldLocalize(MI, TTI);
29704}
29705
29706bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
29707 // Fallback for scalable vectors.
29708 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29709 // all instructions, regardless of whether they are actually supported.
29710 if (!EnableSVEGISel) {
29711 if (Inst.getType()->isScalableTy()) {
29712 return true;
29713 }
29714
29715 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29716 if (Inst.getOperand(i)->getType()->isScalableTy())
29717 return true;
29718
29719 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29720 if (AI->getAllocatedType()->isScalableTy())
29721 return true;
29722 }
29723 }
29724
29725 // Checks to allow the use of SME instructions
29726 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29727 auto CallAttrs = SMECallAttrs(*Base, this);
29728 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29729 CallAttrs.requiresPreservingZT0() ||
29730 CallAttrs.requiresPreservingAllZAState())
29731 return true;
29732 }
29733 return false;
29734}
29735
29736// Return the largest legal scalable vector type that matches VT's element type.
29740 "Expected legal fixed length vector!");
29741 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29742 default:
29743 llvm_unreachable("unexpected element type for SVE container");
29744 case MVT::i8:
29745 return EVT(MVT::nxv16i8);
29746 case MVT::i16:
29747 return EVT(MVT::nxv8i16);
29748 case MVT::i32:
29749 return EVT(MVT::nxv4i32);
29750 case MVT::i64:
29751 return EVT(MVT::nxv2i64);
29752 case MVT::bf16:
29753 return EVT(MVT::nxv8bf16);
29754 case MVT::f16:
29755 return EVT(MVT::nxv8f16);
29756 case MVT::f32:
29757 return EVT(MVT::nxv4f32);
29758 case MVT::f64:
29759 return EVT(MVT::nxv2f64);
29760 }
29761}
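// Example mappings (the container depends only on the element type, not on
// the fixed-length element count):
//   v8i8, v16i8, v32i8, ...  -> nxv16i8
//   v4i32, v8i32, ...        -> nxv4i32
//   v4f32, v8f32, ...        -> nxv4f32
//   v2f64, v4f64, ...        -> nxv2f64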
29762
29763// Return a predicate with active lanes corresponding to the extent of VT.
29764static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
29765 EVT VT) {
29766 assert(VT.isFixedLengthVector() &&
29767 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29768 "Expected legal fixed length vector!");
29769
29770 std::optional<unsigned> PgPattern =
29771 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
29772 assert(PgPattern && "Unexpected element count for SVE predicate");
29773
29774 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29775 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29776 // variants of instructions when available.
29777 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29778 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29779 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29780 if (MaxSVESize && MinSVESize == MaxSVESize &&
29781 MaxSVESize == VT.getSizeInBits())
29782 PgPattern = AArch64SVEPredPattern::all;
29783
29784 MVT MaskVT;
29785 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29786 default:
29787 llvm_unreachable("unexpected element type for SVE predicate");
29788 case MVT::i8:
29789 MaskVT = MVT::nxv16i1;
29790 break;
29791 case MVT::i16:
29792 case MVT::f16:
29793 case MVT::bf16:
29794 MaskVT = MVT::nxv8i1;
29795 break;
29796 case MVT::i32:
29797 case MVT::f32:
29798 MaskVT = MVT::nxv4i1;
29799 break;
29800 case MVT::i64:
29801 case MVT::f64:
29802 MaskVT = MVT::nxv2i1;
29803 break;
29804 }
29805
29806 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29807}
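// For example (sketch), a fixed-length v8i32 yields an nxv4i1 predicate built
// from the VL8 pattern, i.e. the equivalent of "ptrue p.s, vl8". When the
// minimum and maximum SVE vector lengths are known to be equal and match the
// vector's size, the ALL pattern is used instead so that unpredicated
// instruction forms become available.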
29808
29809static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
29810 EVT VT) {
29811 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29812 "Expected legal scalable vector!");
29813 auto PredTy = VT.changeVectorElementType(MVT::i1);
29814 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29815}
29816
29817static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
29818 if (VT.isFixedLengthVector())
29819 return getPredicateForFixedLengthVector(DAG, DL, VT);
29820
29821 return getPredicateForScalableVector(DAG, DL, VT);
29822}
29823
29824// Grow V to consume an entire SVE register.
29825static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29826 assert(VT.isScalableVector() &&
29827 "Expected to convert into a scalable vector!");
29828 assert(V.getValueType().isFixedLengthVector() &&
29829 "Expected a fixed length vector operand!");
29830 SDLoc DL(V);
29831 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29832 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29833}
29834
29835// Shrink V so it's just big enough to maintain a VT's worth of data.
29838 "Expected to convert into a fixed length vector!");
29839 assert(V.getValueType().isScalableVector() &&
29840 "Expected a scalable vector operand!");
29841 SDLoc DL(V);
29842 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29843 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29844}
29845
29846// Convert all fixed length vector loads larger than NEON to masked_loads.
29847SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29848 SDValue Op, SelectionDAG &DAG) const {
29849 auto Load = cast<LoadSDNode>(Op);
29850
29851 SDLoc DL(Op);
29852 EVT VT = Op.getValueType();
29853 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29854 EVT LoadVT = ContainerVT;
29855 EVT MemVT = Load->getMemoryVT();
29856
29857 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29858
29859 if (VT.isFloatingPoint()) {
29860 LoadVT = ContainerVT.changeTypeToInteger();
29861 MemVT = MemVT.changeTypeToInteger();
29862 }
29863
29864 SDValue NewLoad = DAG.getMaskedLoad(
29865 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29866 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29867 Load->getAddressingMode(), Load->getExtensionType());
29868
29869 SDValue Result = NewLoad;
29870 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29871 EVT ExtendVT = ContainerVT.changeVectorElementType(
29872 Load->getMemoryVT().getVectorElementType());
29873
29874 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29875 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29876 Pg, Result, DAG.getUNDEF(ContainerVT));
29877 } else if (VT.isFloatingPoint()) {
29878 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29879 }
29880
29881 Result = convertFromScalableVector(DAG, VT, Result);
29882 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29883 return DAG.getMergeValues(MergedValues, DL);
29884}
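// Concrete example (sketch): with 256-bit SVE vectors, a plain v8f32 load is
// rewritten as a masked load of nxv4i32 governed by the VL8 predicate, bitcast
// back to nxv4f32, and finally shrunk to v8f32 via convertFromScalableVector.
// Extending FP loads additionally pass through FP_EXTEND_MERGE_PASSTHRU as
// handled above.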
29885
29886static SDValue convertFixedMaskToScalableVector(SDValue Mask,
29887 SelectionDAG &DAG) {
29888 SDLoc DL(Mask);
29889 EVT InVT = Mask.getValueType();
29890 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29891 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29892
29893 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29894 return Pg;
29895
29896 bool InvertCond = false;
29897 if (isBitwiseNot(Mask)) {
29898 InvertCond = true;
29899 Mask = Mask.getOperand(0);
29900 }
29901
29902 SDValue Op1, Op2;
29903 ISD::CondCode CC;
29904
29905 // When Mask is the result of a SETCC, it's better to regenerate the compare.
29906 if (Mask.getOpcode() == ISD::SETCC) {
29907 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29908 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29909 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29910 } else {
29911 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29912 Op2 = DAG.getConstant(0, DL, ContainerVT);
29913 CC = ISD::SETNE;
29914 }
29915
29916 if (InvertCond)
29917 CC = getSetCCInverse(CC, Op1.getValueType());
29918
29919 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29920 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29921}
29922
29923// Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
29924SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29925 SDValue Op, SelectionDAG &DAG) const {
29926 auto Load = cast<MaskedLoadSDNode>(Op);
29927
29928 SDLoc DL(Op);
29929 EVT VT = Op.getValueType();
29930 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29931
29932 SDValue Mask = Load->getMask();
29933 // If this is an extending load and the mask type is not the same as
29934 // the load's type then we have to extend the mask type.
29935 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29936 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29937 "Incorrect mask type");
29938 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29939 }
29940 Mask = convertFixedMaskToScalableVector(Mask, DAG);
29941
29942 SDValue PassThru;
29943 bool IsPassThruZeroOrUndef = false;
29944
29945 if (Load->getPassThru()->isUndef()) {
29946 PassThru = DAG.getUNDEF(ContainerVT);
29947 IsPassThruZeroOrUndef = true;
29948 } else {
29949 if (ContainerVT.isInteger())
29950 PassThru = DAG.getConstant(0, DL, ContainerVT);
29951 else
29952 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29953 if (isZerosVector(Load->getPassThru().getNode()))
29954 IsPassThruZeroOrUndef = true;
29955 }
29956
29957 SDValue NewLoad = DAG.getMaskedLoad(
29958 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29959 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29960 Load->getAddressingMode(), Load->getExtensionType());
29961
29962 SDValue Result = NewLoad;
29963 if (!IsPassThruZeroOrUndef) {
29964 SDValue OldPassThru =
29965 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29966 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29967 }
29968
29969 Result = convertFromScalableVector(DAG, VT, Result);
29970 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29971 return DAG.getMergeValues(MergedValues, DL);
29972}
29973
29974// Convert all fixed length vector stores larger than NEON to masked_stores.
29975SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29976 SDValue Op, SelectionDAG &DAG) const {
29977 auto Store = cast<StoreSDNode>(Op);
29978
29979 SDLoc DL(Op);
29980 EVT VT = Store->getValue().getValueType();
29981 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29982 EVT MemVT = Store->getMemoryVT();
29983
29984 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29985 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29986
29987 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29988 EVT TruncVT = ContainerVT.changeVectorElementType(
29989 Store->getMemoryVT().getVectorElementType());
29990 MemVT = MemVT.changeTypeToInteger();
29991 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29992 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29993 DAG.getUNDEF(TruncVT));
29994 NewValue =
29995 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29996 } else if (VT.isFloatingPoint()) {
29997 MemVT = MemVT.changeTypeToInteger();
29998 NewValue =
29999 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
30000 }
30001
30002 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
30003 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
30004 Store->getMemOperand(), Store->getAddressingMode(),
30005 Store->isTruncatingStore());
30006}
30007
30008SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
30009 SDValue Op, SelectionDAG &DAG) const {
30010 auto *Store = cast<MaskedStoreSDNode>(Op);
30011
30012 SDLoc DL(Op);
30013 EVT VT = Store->getValue().getValueType();
30014 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30015
30016 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30017 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
30018
30019 return DAG.getMaskedStore(
30020 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
30021 Mask, Store->getMemoryVT(), Store->getMemOperand(),
30022 Store->getAddressingMode(), Store->isTruncatingStore());
30023}
30024
30025SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
30026 SDValue Op, SelectionDAG &DAG) const {
30027 SDLoc DL(Op);
30028 EVT VT = Op.getValueType();
30029 EVT EltVT = VT.getVectorElementType();
30030
30031 bool Signed = Op.getOpcode() == ISD::SDIV;
30032 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
30033
30034 bool Negated;
30035 uint64_t SplatVal;
30036 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
30037 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30038 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30039 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
30040
30041 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30042 SDValue Res =
30043 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
30044 if (Negated)
30045 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
30046 DAG.getConstant(0, DL, ContainerVT), Res);
30047
30048 return convertFromScalableVector(DAG, VT, Res);
30049 }
30050
30051 // Scalable vector i32/i64 DIV is supported.
30052 if (EltVT == MVT::i32 || EltVT == MVT::i64)
30053 return LowerToPredicatedOp(Op, DAG, PredOpcode);
30054
30055 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
30056 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
30057 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
30058 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30059
30060 // If the wider type is legal: extend, op, and truncate.
30061 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
30062 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
30063 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
30064 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
30065 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
30066 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
30067 }
30068
30069 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
30070 &ExtendOpcode](SDValue Op) {
30071 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
30072 SDValue IdxHalf =
30073 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
30074 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
30075 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
30076 return std::pair<SDValue, SDValue>(
30077 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
30078 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
30079 };
30080
30081 // If the wider type is not legal: split, extend, op, trunc and concat.
30082 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
30083 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
30084 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
30085 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
30086 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
30087 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
30088 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
30089}
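// Worked example (sketch): an sdiv on v16i8 has no direct SVE instruction,
// so assuming the widened type (v16i16) is also not legal, each operand is
// split into v8i8 halves, sign-extended one element width, divided
// (recursively re-entering this lowering until a legal i32/i64 divide is
// reached), truncated back, and the two halves are concatenated:
//   Lo = trunc(div(ext(lo(A)), ext(lo(B))))
//   Hi = trunc(div(ext(hi(A)), ext(hi(B))))
//   Result = concat(Lo, Hi)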
30090
30091SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
30092 SDValue Op, SelectionDAG &DAG) const {
30093 EVT VT = Op.getValueType();
30094 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30095
30096 SDLoc DL(Op);
30097 SDValue Val = Op.getOperand(0);
30098 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30099 Val = convertToScalableVector(DAG, ContainerVT, Val);
30100
30101 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
30102 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
30103
30104 // Repeatedly unpack Val until the result is of the desired element type.
30105 switch (ContainerVT.getSimpleVT().SimpleTy) {
30106 default:
30107 llvm_unreachable("unimplemented container type");
30108 case MVT::nxv16i8:
30109 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
30110 if (VT.getVectorElementType() == MVT::i16)
30111 break;
30112 [[fallthrough]];
30113 case MVT::nxv8i16:
30114 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
30115 if (VT.getVectorElementType() == MVT::i32)
30116 break;
30117 [[fallthrough]];
30118 case MVT::nxv4i32:
30119 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
30120 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
30121 break;
30122 }
30123
30124 return convertFromScalableVector(DAG, VT, Val);
30125}
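// For example (sketch), sign-extending a fixed-length v4i8 to v4i64 becomes a
// chain of SUNPKLO operations on the scalable containers,
//   nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64,
// after which the fixed-length result is extracted from the final container.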
30126
30127SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
30128 SDValue Op, SelectionDAG &DAG) const {
30129 EVT VT = Op.getValueType();
30130 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30131
30132 SDLoc DL(Op);
30133 SDValue Val = Op.getOperand(0);
30134 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30135 Val = convertToScalableVector(DAG, ContainerVT, Val);
30136
30137 // Repeatedly truncate Val until the result is of the desired element type.
30138 switch (ContainerVT.getSimpleVT().SimpleTy) {
30139 default:
30140 llvm_unreachable("unimplemented container type");
30141 case MVT::nxv2i64:
30142 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
30143 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
30144 if (VT.getVectorElementType() == MVT::i32)
30145 break;
30146 [[fallthrough]];
30147 case MVT::nxv4i32:
30148 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
30149 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
30150 if (VT.getVectorElementType() == MVT::i16)
30151 break;
30152 [[fallthrough]];
30153 case MVT::nxv8i16:
30154 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
30155 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
30156 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
30157 break;
30158 }
30159
30160 return convertFromScalableVector(DAG, VT, Val);
30161}
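// For example (sketch), truncating a fixed-length v4i64 to v4i8 repeatedly
// narrows the container with UZP1 (keeping the low half of each element):
//   nxv2i64 -> bitcast nxv4i32 -> uzp1 -> bitcast nxv8i16 -> uzp1
//           -> bitcast nxv16i8 -> uzp1
// and then extracts the v4i8 result from the scalable value.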
30162
30163SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
30164 SDValue Op, SelectionDAG &DAG) const {
30165 EVT VT = Op.getValueType();
30166 EVT InVT = Op.getOperand(0).getValueType();
30167 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
30168
30169 SDLoc DL(Op);
30170 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30171 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30172
30173 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
30174}
30175
30176SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
30177 SDValue Op, SelectionDAG &DAG) const {
30178 EVT VT = Op.getValueType();
30179 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30180
30181 SDLoc DL(Op);
30182 EVT InVT = Op.getOperand(0).getValueType();
30183 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30184 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30185
30186 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
30187 Op.getOperand(1), Op.getOperand(2));
30188
30189 return convertFromScalableVector(DAG, VT, ScalableRes);
30190}
30191
30192// Convert vector operation 'Op' to an equivalent predicated operation whereby
30193// the original operation's type is used to construct a suitable predicate.
30194// NOTE: The results for inactive lanes are undefined.
30195SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
30196 SelectionDAG &DAG,
30197 unsigned NewOp) const {
30198 EVT VT = Op.getValueType();
30199 SDLoc DL(Op);
30200 auto Pg = getPredicateForVector(DAG, DL, VT);
30201
30202 if (VT.isFixedLengthVector()) {
30203 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
30204 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30205
30206 // Create list of operands by converting existing ones to scalable types.
30207 SmallVector<SDValue, 4> Operands = {Pg};
30208 for (const SDValue &V : Op->op_values()) {
30209 if (isa<CondCodeSDNode>(V)) {
30210 Operands.push_back(V);
30211 continue;
30212 }
30213
30214 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
30215 EVT VTArg = VTNode->getVT().getVectorElementType();
30216 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
30217 Operands.push_back(DAG.getValueType(NewVTArg));
30218 continue;
30219 }
30220
30221 assert(isTypeLegal(V.getValueType()) &&
30222 "Expected only legal fixed-width types");
30223 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
30224 }
30225
30226 if (isMergePassthruOpcode(NewOp))
30227 Operands.push_back(DAG.getUNDEF(ContainerVT));
30228
30229 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30230 return convertFromScalableVector(DAG, VT, ScalableRes);
30231 }
30232
30233 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
30234
30235 SmallVector<SDValue, 4> Operands = {Pg};
30236 for (const SDValue &V : Op->op_values()) {
30237 assert((!V.getValueType().isVector() ||
30238 V.getValueType().isScalableVector()) &&
30239 "Only scalable vectors are supported!");
30240 Operands.push_back(V);
30241 }
30242
30243 if (isMergePassthruOpcode(NewOp))
30244 Operands.push_back(DAG.getUNDEF(VT));
30245
30246 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
30247}
30248
30249// If a fixed length vector operation has no side effects when applied to
30250// undefined elements, we can safely use scalable vectors to perform the same
30251// operation without needing to worry about predication.
30252SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
30253 SelectionDAG &DAG) const {
30254 EVT VT = Op.getValueType();
30256 "Only expected to lower fixed length vector operation!");
30257 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30258
30259 // Create list of operands by converting existing ones to scalable types.
30260 SmallVector<SDValue, 4> Ops;
30261 for (const SDValue &V : Op->op_values()) {
30262 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
30263
30264 // Pass through non-vector operands.
30265 if (!V.getValueType().isVector()) {
30266 Ops.push_back(V);
30267 continue;
30268 }
30269
30270 // "cast" fixed length vector to a scalable vector.
30271 assert(V.getValueType().isFixedLengthVector() &&
30272 isTypeLegal(V.getValueType()) &&
30273 "Only fixed length vectors are supported!");
30274 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
30275 }
30276
30277 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
30278 return convertFromScalableVector(DAG, VT, ScalableRes);
30279}
30280
30281SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
30282 SelectionDAG &DAG) const {
30283 SDLoc DL(ScalarOp);
30284 SDValue AccOp = ScalarOp.getOperand(0);
30285 SDValue VecOp = ScalarOp.getOperand(1);
30286 EVT SrcVT = VecOp.getValueType();
30287 EVT ResVT = SrcVT.getVectorElementType();
30288
30289 EVT ContainerVT = SrcVT;
30290 if (SrcVT.isFixedLengthVector()) {
30291 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30292 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30293 }
30294
30295 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30296 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30297
30298 // Convert operands to Scalable.
30299 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
30300 DAG.getUNDEF(ContainerVT), AccOp, Zero);
30301
30302 // Perform reduction.
30303 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
30304 Pg, AccOp, VecOp);
30305
30306 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
30307}
30308
30309SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
30310 SelectionDAG &DAG) const {
30311 SDLoc DL(ReduceOp);
30312 SDValue Op = ReduceOp.getOperand(0);
30313 EVT OpVT = Op.getValueType();
30314 EVT VT = ReduceOp.getValueType();
30315
30316 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
30317 return SDValue();
30318
30319 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
30320
30321 switch (ReduceOp.getOpcode()) {
30322 default:
30323 return SDValue();
30324 case ISD::VECREDUCE_OR:
30325 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
30326 // The predicate can be 'Op' because
30327 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
30328 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
30329 else
30330 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
30331 case ISD::VECREDUCE_AND: {
30332 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
30333 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
30334 }
30335 case ISD::VECREDUCE_XOR: {
30336 SDValue ID =
30337 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
30338 if (OpVT == MVT::nxv1i1) {
30339 // Emulate a CNTP on .Q using .D and a different governing predicate.
30340 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
30341 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
30342 }
30343 SDValue Cntp =
30344 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
30345 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
30346 }
30347 }
30348
30349 return SDValue();
30350}
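// The VECREDUCE_AND case above uses De Morgan's law: "all active lanes of Op
// are true" is equivalent to "no active lane of (Op xor Pg) is set", which
// maps onto a single PTEST with the NONE_ACTIVE condition. VECREDUCE_XOR
// counts the active lanes with CNTP and keeps only the low bit of the count,
// i.e. the parity.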
30351
30352SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
30353 SDValue ScalarOp,
30354 SelectionDAG &DAG) const {
30355 SDLoc DL(ScalarOp);
30356 SDValue VecOp = ScalarOp.getOperand(0);
30357 EVT SrcVT = VecOp.getValueType();
30358
30359 if (useSVEForFixedLengthVectorVT(
30360 SrcVT,
30361 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
30362 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30363 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30364 }
30365
30366 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
30367 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
30368 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
30369 SDValue BoolVec = VecOp.getOperand(0);
30370 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
30371 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
30372 SDValue CntpOp = DAG.getNode(
30373 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
30374 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
30375 BoolVec, BoolVec);
30376 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
30377 }
30378 }
30379
30380 // UADDV always returns an i64 result.
30381 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
30382 SrcVT.getVectorElementType();
30383 EVT RdxVT = SrcVT;
30384 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
30385 RdxVT = getPackedSVEVectorVT(ResVT);
30386
30387 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30388 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
30389 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
30390 Rdx, DAG.getConstant(0, DL, MVT::i64));
30391
30392 // The VEC_REDUCE nodes expect an element size result.
30393 if (ResVT != ScalarOp.getValueType())
30394 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
30395
30396 return Res;
30397}
30398
30399SDValue
30400AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
30401 SelectionDAG &DAG) const {
30402 EVT VT = Op.getValueType();
30403 SDLoc DL(Op);
30404
30405 EVT InVT = Op.getOperand(1).getValueType();
30406 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30407 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
30408 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
30409
30410 // Convert the mask to a predicate (NOTE: We don't need to worry about
30411 // inactive lanes since VSELECT is safe when given undefined elements).
30412 EVT MaskVT = Op.getOperand(0).getValueType();
30413 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
30414 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
30416 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
30417
30418 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
30419 Mask, Op1, Op2);
30420
30421 return convertFromScalableVector(DAG, VT, ScalableRes);
30422}
30423
30424SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
30425 SDValue Op, SelectionDAG &DAG) const {
30426 SDLoc DL(Op);
30427 EVT InVT = Op.getOperand(0).getValueType();
30428 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30429
30430 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
30431 "Only expected to lower fixed length vector operation!");
30432 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
30433 "Expected integer result of the same bit length as the inputs!");
30434
30435 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30436 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
30437 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
30438
30439 EVT CmpVT = Pg.getValueType();
30440 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
30441 {Pg, Op1, Op2, Op.getOperand(2)});
30442
30443 EVT PromoteVT = ContainerVT.changeTypeToInteger();
30444 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
30445 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
30446}
30447
30448SDValue
30449AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
30450 SelectionDAG &DAG) const {
30451 SDLoc DL(Op);
30452 auto SrcOp = Op.getOperand(0);
30453 EVT VT = Op.getValueType();
30454 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30455 EVT ContainerSrcVT =
30457
30458 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
30459 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
30460 return convertFromScalableVector(DAG, VT, Op);
30461}
30462
30463SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
30464 SDValue Op, SelectionDAG &DAG) const {
30465 SDLoc DL(Op);
30466 unsigned NumOperands = Op->getNumOperands();
30467
30468 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
30469 "Unexpected number of operands in CONCAT_VECTORS");
30470
30471 auto SrcOp1 = Op.getOperand(0);
30472 auto SrcOp2 = Op.getOperand(1);
30473 EVT VT = Op.getValueType();
30474 EVT SrcVT = SrcOp1.getValueType();
30475
30476 // Match a splat of 128b segments that fit in a single register.
30477 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
30478 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30479 SDValue Splat =
30480 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
30481 convertToScalableVector(DAG, ContainerVT, SrcOp1),
30482 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
30483 return convertFromScalableVector(DAG, VT, Splat);
30484 }
30485
30486 if (NumOperands > 2) {
30487 SmallVector<SDValue, 4> Ops;
30488 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
30489 for (unsigned I = 0; I < NumOperands; I += 2)
30490 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
30491 Op->getOperand(I), Op->getOperand(I + 1)));
30492
30493 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
30494 }
30495
30496 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30497
30499 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
30500 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
30501
30502 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
30503
30504 return convertFromScalableVector(DAG, VT, Op);
30505}
30506
30507SDValue
30508AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
30509 SelectionDAG &DAG) const {
30510 EVT VT = Op.getValueType();
30511 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30512
30513 SDLoc DL(Op);
30514 SDValue Val = Op.getOperand(0);
30515 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30516 EVT SrcVT = Val.getValueType();
30517 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30518 EVT ExtendVT = ContainerVT.changeVectorElementType(
30519 SrcVT.getVectorElementType());
30520
30521 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30522 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
30523
30524 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
30525 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
30526 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30527 Pg, Val, DAG.getUNDEF(ContainerVT));
30528
30529 return convertFromScalableVector(DAG, VT, Val);
30530}
30531
30532SDValue
30533AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
30534 SelectionDAG &DAG) const {
30535 EVT VT = Op.getValueType();
30536 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30537
30538 SDLoc DL(Op);
30539 SDValue Val = Op.getOperand(0);
30540 EVT SrcVT = Val.getValueType();
30541 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30542 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
30544 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
30545
30546 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30547 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
30548 Op.getOperand(1), DAG.getUNDEF(RoundVT));
30549 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
30550 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30551
30552 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30553 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30554}
30555
30556SDValue
30557AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
30558 SelectionDAG &DAG) const {
30559 EVT VT = Op.getValueType();
30560 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30561
30562 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
30563 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
30564 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
30565
30566 SDLoc DL(Op);
30567 SDValue Val = Op.getOperand(0);
30568 EVT SrcVT = Val.getValueType();
30569 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30570 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30571
30572 if (VT.bitsGE(SrcVT)) {
30574
30575 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
30576 VT.changeTypeToInteger(), Val);
30577
30578 // Safe to use a larger than specified operand because by promoting the
30579 // value nothing has changed from an arithmetic point of view.
30580 Val =
30581 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
30582 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30583 DAG.getUNDEF(ContainerDstVT));
30584 return convertFromScalableVector(DAG, VT, Val);
30585 } else {
30586 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
30587 ContainerDstVT.getVectorElementType());
30589
30590 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30591 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30592 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
30593 Val = convertFromScalableVector(DAG, SrcVT, Val);
30594
30595 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30596 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30597 }
30598}
30599
30600SDValue
30601AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
30602 SelectionDAG &DAG) const {
30603 SDLoc DL(Op);
30604 EVT OpVT = Op.getValueType();
30605 assert(OpVT.isScalableVector() &&
30606 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
30607
30608 // Are multi-register uzp instructions available?
30609 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30610 OpVT.getVectorElementType() != MVT::i1) {
30611 Intrinsic::ID IntID;
30612 switch (Op->getNumOperands()) {
30613 default:
30614 return SDValue();
30615 case 2:
30616 IntID = Intrinsic::aarch64_sve_uzp_x2;
30617 break;
30618 case 4:
30619 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30620 OpVT.getScalarSizeInBits() == 64)
30621 return SDValue();
30622 IntID = Intrinsic::aarch64_sve_uzp_x4;
30623 break;
30624 }
30625
30627 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30628 Ops.append(Op->op_values().begin(), Op->op_values().end());
30629 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30630 }
30631
30632 if (Op->getNumOperands() != 2)
30633 return SDValue();
30634
30635 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
30636 Op.getOperand(1));
30637 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
30638 Op.getOperand(1));
30639 return DAG.getMergeValues({Even, Odd}, DL);
30640}
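// Two-operand example (sketch): deinterleaving data {a0,b0,a1,b1,...} spread
// across Op0 and Op1 produces
//   Even = UZP1(Op0, Op1)   // lanes 0,2,4,... -> a0,a1,a2,...
//   Odd  = UZP2(Op0, Op1)   // lanes 1,3,5,... -> b0,b1,b2,...
// With SME2 in streaming mode the multi-vector sve.uzp.x2/x4 intrinsics are
// used instead for the 2- and 4-operand forms.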
30641
30642SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
30643 SelectionDAG &DAG) const {
30644 SDLoc DL(Op);
30645 EVT OpVT = Op.getValueType();
30646 assert(OpVT.isScalableVector() &&
30647 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30648
30649 // Are multi-register zip instructions available?
30650 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30651 OpVT.getVectorElementType() != MVT::i1) {
30652 Intrinsic::ID IntID;
30653 switch (Op->getNumOperands()) {
30654 default:
30655 return SDValue();
30656 case 2:
30657 IntID = Intrinsic::aarch64_sve_zip_x2;
30658 break;
30659 case 4:
30660 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30661 OpVT.getScalarSizeInBits() == 64)
30662 return SDValue();
30663 IntID = Intrinsic::aarch64_sve_zip_x4;
30664 break;
30665 }
30666
30668 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30669 Ops.append(Op->op_values().begin(), Op->op_values().end());
30670 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30671 }
30672
30673 if (Op->getNumOperands() != 2)
30674 return SDValue();
30675
30676 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30677 Op.getOperand(1));
30678 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30679 Op.getOperand(1));
30680 return DAG.getMergeValues({Lo, Hi}, DL);
30681}
30682
30683SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30684 SelectionDAG &DAG) const {
30685 // FIXME: Maybe share some code with LowerMGather/Scatter?
30686 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30687 SDLoc DL(HG);
30688 SDValue Chain = HG->getChain();
30689 SDValue Inc = HG->getInc();
30690 SDValue Mask = HG->getMask();
30691 SDValue Ptr = HG->getBasePtr();
30692 SDValue Index = HG->getIndex();
30693 SDValue Scale = HG->getScale();
30694 SDValue IntID = HG->getIntID();
30695
30696 // The Intrinsic ID determines the type of update operation.
30697 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30698 // Right now, we only support 'add' as an update.
30699 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30700 "Unexpected histogram update operation");
30701
30702 EVT IndexVT = Index.getValueType();
30703 LLVMContext &Ctx = *DAG.getContext();
30704 ElementCount EC = IndexVT.getVectorElementCount();
30705 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30706 EVT IncExtVT =
30707 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30708 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30709 bool ExtTrunc = IncSplatVT != MemVT;
30710
30711 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30712 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30713 SDValue IncSplat = DAG.getSplatVector(
30714 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30715 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30716
30717 MachineMemOperand *MMO = HG->getMemOperand();
30718 // Create an MMO for the gather, without load|store flags.
30719 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30721 MMO->getAlign(), MMO->getAAInfo());
30722 ISD::MemIndexType IndexType = HG->getIndexType();
30723 SDValue Gather = DAG.getMaskedGather(
30724 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30725 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30726
30727 SDValue GChain = Gather.getValue(1);
30728
30729 // Perform the histcnt, multiply by inc, add to bucket data.
30730 SDValue ID =
30731 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30732 SDValue HistCnt =
30733 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30734 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30735 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30736
30737 // Create an MMO for the scatter, without load|store flags.
30738 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30740 MMO->getAlign(), MMO->getAAInfo());
30741
30742 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30743 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30744 ScatterOps, SMMO, IndexType, ExtTrunc);
30745 return Scatter;
30746}
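// In outline, the emitted sequence is (sketch):
//   Buckets = masked.gather(Ptr + Index * Scale, Mask)   // current counts
//   Matches = sve.histcnt(Mask, Index, Index)            // per-lane counts
//   Updated = Buckets + Matches * splat(Inc)
//   masked.scatter(Updated, Ptr + Index * Scale, Mask)
// The per-lane counts from HISTCNT, combined with the element-ordered scatter,
// make repeated bucket indices accumulate correctly.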
30747
30748/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30749/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30750/// however still make use of the dot product instruction by instead
30751/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30752/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30753/// the following pattern is emitted:
30754/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N,
30755/// NTy/2))))
30756SDValue
30757AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30758 SelectionDAG &DAG) const {
30759 SDLoc DL(Op);
30760
30761 SDValue Acc = Op.getOperand(0);
30762 SDValue LHS = Op.getOperand(1);
30763 SDValue RHS = Op.getOperand(2);
30764 EVT ResultVT = Op.getValueType();
30765 EVT OrigResultVT = ResultVT;
30766 EVT OpVT = LHS.getValueType();
30767
30768 bool ConvertToScalable =
30769 ResultVT.isFixedLengthVector() &&
30770 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30771
30772 if (ConvertToScalable) {
30773 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30774 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30775 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30776 LHS = convertToScalableVector(DAG, OpVT, LHS);
30777 RHS = convertToScalableVector(DAG, OpVT, RHS);
30778 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30779 }
30780
30781 // Two-way and four-way partial reductions are supported by patterns.
30782 // We only need to handle the 8-way partial reduction.
30783 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30784 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30785 : Op;
30786
30787 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30788 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30789 DAG.getConstant(0, DL, DotVT), LHS, RHS);
30790
30791 SDValue Res;
30792 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30793 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30794 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30795 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30796 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30797 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30798 } else {
30799 // Fold (nx)v4i32 into (nx)v2i64
30800 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30801 if (IsUnsigned) {
30802 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30803 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30804 } else {
30805 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30806 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30807 }
30808 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30809 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30810 }
30811
30812 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30813 : Res;
30814}
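// Example (sketch) for an unsigned nxv2i64 accumulator with nxv16i8 inputs
// when SVE2 (or streaming SVE) is available:
//   Dot = udot(zero:nxv4i32, LHS, RHS)   // 8-way reduced to 4-way partial sums
//   Lo  = UADDWB(Acc, Dot)               // widen-add the even i32 lanes
//   Res = UADDWT(Lo, Dot)                // widen-add the odd i32 lanes
// Without SVE2 the i32 partial sums are split, zero- or sign-extended to i64
// and added to the accumulator with ordinary ADDs.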
30815
30816SDValue
30817AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30818 SelectionDAG &DAG) const {
30819 EVT VT = Op.getValueType();
30820 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30821
30822 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30823 "Lowering fixed length get_active_lane_mask requires SVE!");
30824
30825 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30826 // but we can use SVE when available.
30827
30828 SDLoc DL(Op);
30829 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30830 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30831
30832 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30833 Op.getOperand(0), Op.getOperand(1));
30834 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30835 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30836 DAG.getVectorIdxConstant(0, DL));
30837}
30838
30839SDValue
30840AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30841 SelectionDAG &DAG) const {
30842 EVT VT = Op.getValueType();
30843 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30844
30845 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30846 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30847 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30848
30849 SDLoc DL(Op);
30850 SDValue Val = Op.getOperand(0);
30851 EVT SrcVT = Val.getValueType();
30852 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30853 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30854
30855 if (VT.bitsGT(SrcVT)) {
30856 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30857 ContainerSrcVT.getVectorElementType());
30858 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30859
30860 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30861 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30862
30863 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30864 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30865 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30866 DAG.getUNDEF(ContainerDstVT));
30867 return convertFromScalableVector(DAG, VT, Val);
30868 } else {
30869 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30870 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30871
30872 // Safe to use a larger than specified result since an fp_to_int where the
30873 // result doesn't fit into the destination is undefined.
30874 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30875 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30876 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30877
30878 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30879 }
30880}
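
A minimal scalar sketch of the narrowing branch above (illustrative, not the DAG code): converting, say, f32 to i16 is done as f32 to i32 followed by truncation. The wider intermediate is safe because an fp-to-int conversion whose result does not fit the destination is undefined anyway.

#include <cstdint>

// f32 -> i16 modelled as f32 -> i32 (source-element-sized convert) + truncate.
int16_t fpToI16ViaI32(float F) {
  int32_t Wide = static_cast<int32_t>(F); // the FCVTZS-sized result
  return static_cast<int16_t>(Wide);      // ISD::TRUNCATE to the requested type
}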
30881
30882 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
30883 ArrayRef<int> ShuffleMask, EVT VT,
30884 EVT ContainerVT, SelectionDAG &DAG) {
30885 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30886 SDLoc DL(Op);
30887 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30888 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30889 bool IsSingleOp =
30890 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30891
30892 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30893 MinSVESize = 128;
30894
30895 // Bail out on two-operand shuffles if SVE2 is unavailable or not all of the
30896 // index numbers can be represented.
30897 if (!IsSingleOp && !Subtarget.hasSVE2())
30898 return SDValue();
30899
30900 EVT VTOp1 = Op.getOperand(0).getValueType();
30901 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30902 unsigned IndexLen = MinSVESize / BitsPerElt;
30903 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30904 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30905 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30906 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30907 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30908 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30909 "Incorrectly legalised shuffle operation");
30910
30911 SmallVector<SDValue, 8> TBLMask;
30912 // If MinSVESize is not equal to MaxSVESize then we need to know which
30913 // TBL mask element needs adjustment.
30914 SmallVector<SDValue, 8> AddRuntimeVLMask;
30915
30916 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30917 // size, 8 bits are only sufficient to index into the first source vector.
30918 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30919 return SDValue();
30920
30921 for (int Index : ShuffleMask) {
30922 // Handle poison index values.
30923 if (Index < 0)
30924 Index = 0;
30925 // If the mask refers to elements in the second operand, then we have to
30926 // offset the index by the number of elements in a vector. If this number
30927 // is not known at compile-time, we need to maintain a mask with 'VL' values
30928 // to add at runtime.
30929 if ((unsigned)Index >= ElementsPerVectorReg) {
30930 if (MinMaxEqual) {
30931 Index += IndexLen - ElementsPerVectorReg;
30932 } else {
30933 Index = Index - ElementsPerVectorReg;
30934 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30935 }
30936 } else if (!MinMaxEqual)
30937 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30938 // For 8-bit elements with 1024-bit SVE registers, MaxOffset equals 255 and
30939 // this might point to the last element in the second operand of the
30940 // shufflevector, so we reject this transform.
30941 if ((unsigned)Index >= MaxOffset)
30942 return SDValue();
30943 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30944 }
30945
30946 // Pad the unused tail with an out-of-range index so those lanes are zeroed,
30947 // rather than with index zero, which would duplicate the first lane into
30948 // them. Note that for i8 elements such an "out-of-range" index can still be
30949 // a valid lane index for a 2048-bit vector register size.
30950 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30951 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30952 if (!MinMaxEqual)
30953 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30954 }
30955
30956 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30957 SDValue VecMask =
30958 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30959 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30960
30961 SDValue Shuffle;
30962 if (IsSingleOp)
30963 Shuffle =
30964 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30965 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30966 Op1, SVEMask);
30967 else if (Subtarget.hasSVE2()) {
30968 if (!MinMaxEqual) {
30969 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30970 SDValue VScale = (BitsPerElt == 64)
30971 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30972 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30973 SDValue VecMask =
30974 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30975 SDValue MulByMask = DAG.getNode(
30976 ISD::MUL, DL, MaskType,
30977 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30978 DAG.getBuildVector(MaskType, DL,
30979 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30980 SDValue UpdatedVecMask =
30981 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30982 SVEMask = convertToScalableVector(
30983 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30984 }
30985 Shuffle =
30986 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30987 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30988 Op1, Op2, SVEMask);
30989 }
30990 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30991 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30992}
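
A scalar sketch (illustrative, assuming the register size is known, i.e. MinSVESize == MaxSVESize) of how the shuffle mask becomes a TBL mask: indices that select from the second source are rebased past the end of the first TBL register, indices that cannot be encoded in an element abort the transform, and the unused tail of the mask register is filled with an out-of-range value so those lanes read as zero.

#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<uint64_t>>
buildTBLMask(const std::vector<int> &ShuffleMask, unsigned ElementsPerVectorReg,
             unsigned IndexLen, uint64_t MaxOffset) {
  std::vector<uint64_t> TBLMask;
  for (int Index : ShuffleMask) {
    if (Index < 0) // poison index: any lane will do
      Index = 0;
    uint64_t Idx = uint64_t(Index);
    if (Idx >= ElementsPerVectorReg) // selects from the second source register
      Idx += IndexLen - ElementsPerVectorReg;
    if (Idx >= MaxOffset) // not encodable in a mask element: give up
      return std::nullopt;
    TBLMask.push_back(Idx);
  }
  // Pad the rest of the mask register with an out-of-range index so TBL zeroes
  // the corresponding lanes.
  for (unsigned I = 0; I < IndexLen - ElementsPerVectorReg; ++I)
    TBLMask.push_back(MaxOffset);
  return TBLMask;
}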
30993
30994SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30995 SDValue Op, SelectionDAG &DAG) const {
30996 EVT VT = Op.getValueType();
30997 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30998
30999 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
31000 auto ShuffleMask = SVN->getMask();
31001
31002 SDLoc DL(Op);
31003 SDValue Op1 = Op.getOperand(0);
31004 SDValue Op2 = Op.getOperand(1);
31005
31006 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31007 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
31008 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
31009
31010 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
31011 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
31012 return MVT::i32;
31013 return ScalarTy;
31014 };
31015
31016 if (SVN->isSplat()) {
31017 unsigned Lane = std::max(0, SVN->getSplatIndex());
31018 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31019 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31020 DAG.getConstant(Lane, DL, MVT::i64));
31021 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
31022 return convertFromScalableVector(DAG, VT, Op);
31023 }
31024
31025 bool ReverseEXT = false;
31026 unsigned Imm;
31027 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
31028 Imm == VT.getVectorNumElements() - 1) {
31029 if (ReverseEXT)
31030 std::swap(Op1, Op2);
31031 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31032 SDValue Scalar = DAG.getNode(
31033 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31034 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
31035 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
31036 return convertFromScalableVector(DAG, VT, Op);
31037 }
31038
31039 unsigned EltSize = VT.getScalarSizeInBits();
31040 for (unsigned BlockSize : {64U, 32U, 16U}) {
31041 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
31042 unsigned RevOp;
31043 if (EltSize == 8)
31044 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
31045 else if (EltSize == 16)
31046 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
31047 else
31048 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
31049 EVT BlockedVT =
31050 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
31051 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
31052 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
31053 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
31054 DAG.getUNDEF(BlockedVT));
31055 SDValue Container =
31056 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
31057 return convertFromScalableVector(DAG, VT, Container);
31058 }
31059 }
31060
31061 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
31062 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
31063 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31064 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
31065 Pg, Op1, DAG.getUNDEF(ContainerVT));
31066 return convertFromScalableVector(DAG, VT, Revd);
31067 }
31068
31069 unsigned WhichResult;
31070 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31071 WhichResult == 0)
31072 return convertFromScalableVector(
31073 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
31074
31075 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31076 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31077 return convertFromScalableVector(
31078 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31079 }
31080
31081 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
31082 return convertFromScalableVector(
31083 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
31084
31085 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31086 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31087 return convertFromScalableVector(
31088 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31089 }
31090
31091 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
31092 // represents the same logical operation as performed by a ZIP instruction. In
31093 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
31094 // equivalent to an AArch64 instruction. There's the extra component of
31095 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
31096 // only operated on 64/128bit vector types that have a direct mapping to a
31097 // target register and so an exact mapping is implied.
31098 // However, when using SVE for fixed length vectors, most legal vector types
31099 // are actually sub-vectors of a larger SVE register. When mapping
31100 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
31101 // how the mask's indices translate. Specifically, when the mapping requires
31102 // an exact meaning for a specific vector index (e.g. Index X is the last
31103 // vector element in the register) then such mappings are often only safe when
31104 // the exact SVE register size is known. The main exception to this is when
31105 // indices are logically relative to the first element of either
31106 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
31107 // when converting from fixed-length to scalable vector types (i.e. the start
31108 // of a fixed length vector is always the start of a scalable vector).
31109 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
31110 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
31111 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
31112 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
31113 Op2.isUndef()) {
31114 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
31115 return convertFromScalableVector(DAG, VT, Op);
31116 }
31117
31118 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31119 WhichResult != 0)
31120 return convertFromScalableVector(
31121 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
31122
31123 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31124 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31125 return convertFromScalableVector(
31126 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31127 }
31128
31129 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
31130 return convertFromScalableVector(
31131 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
31132
31133 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31134 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31135 return convertFromScalableVector(
31136 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31137 }
31138
31139 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
31140 Subtarget->isSVEorStreamingSVEAvailable()) {
31141 assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
31142 "Unsupported SVE vector size");
31143
31144 unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
31145 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
31145 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
31146 if (std::optional<unsigned> Lane =
31147 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
31148 SDValue IID =
31149 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
31150 return convertFromScalableVector(
31151 DAG, VT,
31152 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31153 {IID, Op1,
31154 DAG.getConstant(*Lane, DL, MVT::i64,
31155 /*isTarget=*/true)}));
31156 }
31157 }
31158 }
31159
31160 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
31161 // This may allow the shuffle to be matched as something cheaper like ZIP1.
31162 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
31163 return WideOp;
31164
31165 // Avoid producing a TBL instruction if we don't know the minimum SVE register
31166 // size, unless NEON is not available and we can assume the minimum SVE
31167 // register size is 128 bits.
31168 if (MinSVESize || !Subtarget->isNeonAvailable())
31169 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
31170 DAG);
31171
31172 return SDValue();
31173}
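
To make the long comment above concrete, here is a simplified (undef lanes ignored, illustrative only) version of the kind of mask shape that isZIPMask recognises for ZIP1. ZIP1 interleaves the low halves of the two sources, so every index is relative to the start of an operand and stays correct whatever the real SVE register length is; ZIP2 reads the high halves and is therefore only attempted above when MinSVESize == MaxSVESize == VT.getSizeInBits().

#include <vector>

// True if Mask (of size NumElts, no undef entries) is the ZIP1 interleave
// {0, NumElts, 1, NumElts + 1, ...} of two NumElts-element sources.
bool looksLikeZip1(const std::vector<int> &Mask, int NumElts) {
  for (int I = 0; 2 * I < NumElts; ++I)
    if (Mask[2 * I] != I || Mask[2 * I + 1] != NumElts + I)
      return false;
  return true;
}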
31174
31175SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
31176 SelectionDAG &DAG) const {
31177 SDLoc DL(Op);
31178 EVT InVT = Op.getValueType();
31179
31180 assert(VT.isScalableVector() && isTypeLegal(VT) &&
31181 InVT.isScalableVector() && isTypeLegal(InVT) &&
31182 "Only expect to cast between legal scalable vector types!");
31183 assert(VT.getVectorElementType() != MVT::i1 &&
31184 InVT.getVectorElementType() != MVT::i1 &&
31185 "For predicate bitcasts, use getSVEPredicateBitCast");
31186
31187 if (InVT == VT)
31188 return Op;
31189
31190 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
31191 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
31192
31193 // Safe bitcasting between unpacked vector types of different element counts
31194 // is currently unsupported because the following is missing the necessary
31195 // work to ensure the result's elements live where they're supposed to within
31196 // an SVE register.
31197 // 01234567
31198 // e.g. nxv2i32 = XX??XX??
31199 // nxv4f16 = X?X?X?X?
31200 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
31201 VT == PackedVT || InVT == PackedInVT) &&
31202 "Unexpected bitcast!");
31203
31204 // Pack input if required.
31205 if (InVT != PackedInVT)
31206 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
31207
31208 if (Subtarget->isLittleEndian() ||
31209 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
31210 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31211 else {
31212 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
31213 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
31214
31215 // Simulate the effect of casting through memory.
31216 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
31217 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
31218 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
31219 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
31220 if (PackedVTAsInt.getScalarSizeInBits() != 8)
31221 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
31222 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31223 }
31224
31225 // Unpack result if required.
31226 if (VT != PackedVT)
31227 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
31228
31229 return Op;
31230}
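
The big-endian path above simulates a memory round trip with BSWAP / NVCAST / BSWAP. The small standalone check below (illustrative, using GCC/Clang byte-swap builtins) shows the equivalence for the simplest case, two u32 elements reinterpreted as one u64: a store-then-reload bitcast on a big-endian machine gives the same value as byte-swapping each input element, reinterpreting the lanes with lane 0 in the low half, and byte-swapping the wider result.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t E0 = 0x01020304, E1 = 0x0A0B0C0D;
  // Memory-defined bitcast on big-endian: E0's bytes precede E1's, so the
  // 64-bit big-endian load of those eight bytes is (E0 << 32) | E1.
  uint64_t ViaMemory = (uint64_t(E0) << 32) | E1;
  // Simulation used above: BSWAP the inputs, reinterpret the lanes (NVCAST),
  // then BSWAP the wider result.
  uint64_t Reinterpreted =
      (uint64_t(__builtin_bswap32(E1)) << 32) | __builtin_bswap32(E0);
  uint64_t Simulated = __builtin_bswap64(Reinterpreted);
  std::printf("%d\n", ViaMemory == Simulated); // prints 1
  return 0;
}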
31231
31232 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
31233 SDValue N) const {
31234 return ::isAllActivePredicate(DAG, N);
31235}
31236
31237 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
31238 return ::getPromotedVTForPredicate(VT);
31239}
31240
31241bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
31242 SDValue Op, const APInt &OriginalDemandedBits,
31243 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
31244 unsigned Depth) const {
31245
31246 unsigned Opc = Op.getOpcode();
31247 switch (Opc) {
31248 case AArch64ISD::VSHL: {
31249 // Match (VSHL (VLSHR Val X) X)
31250 SDValue ShiftL = Op;
31251 SDValue ShiftR = Op->getOperand(0);
31252 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
31253 return false;
31254
31255 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
31256 return false;
31257
31258 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
31259 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
31260
31261 // Other cases can be handled as well, but this is not
31262 // implemented.
31263 if (ShiftRBits != ShiftLBits)
31264 return false;
31265
31266 unsigned ScalarSize = Op.getScalarValueSizeInBits();
31267 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
31268
31269 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
31270 APInt UnusedBits = ~OriginalDemandedBits;
31271
31272 if ((ZeroBits & UnusedBits) != ZeroBits)
31273 return false;
31274
31275 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
31276 // used - simplify to just Val.
31277 return TLO.CombineTo(Op, ShiftR->getOperand(0));
31278 }
31279 case AArch64ISD::BICi: {
31280 // Fold BICi if all destination bits are already known to be zeroed
31281 SDValue Op0 = Op.getOperand(0);
31282 KnownBits KnownOp0 =
31283 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
31284 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
31285 APInt BitsToClear =
31286 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
31287 .trunc(KnownOp0.getBitWidth());
31288 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
31289 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
31290 return TLO.CombineTo(Op, Op0);
31291
31292 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
31293 return false;
31294 }
31295 case ISD::INTRINSIC_WO_CHAIN: {
31296 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
31297 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31298 if (!MaxSVEVectorSizeInBits)
31299 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31300 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
31301 // The SVE count intrinsics don't support the multiplier immediate so we
31302 // don't have to account for that here. The value returned may be slightly
31303 // over the true required bits, as this is based on the "ALL" pattern. The
31304 // other patterns are also exposed by these intrinsics, but they all
31305 // return a value that's strictly less than "ALL".
31306 unsigned RequiredBits = llvm::bit_width(MaxElements);
31307 unsigned BitWidth = Known.Zero.getBitWidth();
31308 if (RequiredBits < BitWidth)
31309 Known.Zero.setHighBits(BitWidth - RequiredBits);
31310 return false;
31311 }
31312 }
31313 }
31314
31315 return TargetLowering::SimplifyDemandedBitsForTargetNode(
31316 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
31317}
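
The VSHL case above is easier to see in scalar form: (Val >> N) << N only clears the low N bits, so if no user of the node demands those bits the shift pair can be replaced with Val. A tiny illustrative check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Val = 0xDEADBEEF;
  unsigned N = 8;
  uint32_t ShiftPair = (Val >> N) << N;
  uint32_t DemandedMask = 0xFFFFFF00u; // the caller never reads the low N bits
  assert((ShiftPair & DemandedMask) == (Val & DemandedMask));
  return 0;
}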
31318
31319bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
31320 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
31321 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
31322
31323 // TODO: Add more target nodes.
31324 switch (Op.getOpcode()) {
31325 case AArch64ISD::MOVI:
31326 case AArch64ISD::MOVIedit:
31327 case AArch64ISD::MOVImsl:
31328 case AArch64ISD::MOVIshift:
31329 case AArch64ISD::MVNImsl:
31330 case AArch64ISD::MVNIshift:
31331 case AArch64ISD::VASHR:
31332 case AArch64ISD::VLSHR:
31333 case AArch64ISD::VSHL:
31334 return false;
31335 }
31336 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
31337 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
31338}
31339
31340bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
31341 return Op.getOpcode() == AArch64ISD::DUP ||
31342 Op.getOpcode() == AArch64ISD::MOVI ||
31343 Op.getOpcode() == AArch64ISD::MOVIshift ||
31344 Op.getOpcode() == AArch64ISD::MOVImsl ||
31345 Op.getOpcode() == AArch64ISD::MOVIedit ||
31346 Op.getOpcode() == AArch64ISD::MVNIshift ||
31347 Op.getOpcode() == AArch64ISD::MVNImsl ||
31348 // Ignore fneg(movi(0)), because if it is folded to FPConstant(-0.0),
31349 // ISel will select fmov(mov i64 0x8000000000000000), resulting in an
31350 // fmov from gpr to fpr, which is more expensive than fneg(movi(0)).
31351 (Op.getOpcode() == ISD::FNEG &&
31352 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
31353 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
31354 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
31355 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
31356 TargetLowering::isTargetCanonicalConstantNode(Op);
31357}
31358
31359 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
31360 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
31361 Subtarget->hasComplxNum();
31362}
31363
31364 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
31365 ComplexDeinterleavingOperation Operation, Type *Ty) const {
31366 auto *VTy = dyn_cast<VectorType>(Ty);
31367 if (!VTy)
31368 return false;
31369
31370 // If the vector is scalable, SVE is enabled, implying support for complex
31371 // numbers. Otherwise, we need to ensure complex number support is available
31372 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
31373 return false;
31374
31375 auto *ScalarTy = VTy->getScalarType();
31376 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
31377
31378 // We can only process vectors that have a bit size of 128 or higher (or
31379 // exactly 64 bits for NEON). Additionally, these vectors must have a
31380 // power-of-2 size, as we later split them into the smallest supported size
31381 // and merge them back together after applying the complex operation.
31382 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
31383 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
31384 !llvm::isPowerOf2_32(VTyWidth))
31385 return false;
31386
31387 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
31388 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
31389
31390 if (Operation == ComplexDeinterleavingOperation::CDot)
31391 return ScalarWidth == 32 || ScalarWidth == 64;
31392 return 8 <= ScalarWidth && ScalarWidth <= 64;
31393 }
31394
31395 // CDot is not supported outside of scalable/sve scopes
31396 if (Operation == ComplexDeinterleavingOperation::CDot)
31397 return false;
31398
31399 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
31400 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
31401}
31402
31403 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
31404 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
31405 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
31406 Value *Accumulator) const {
31407 VectorType *Ty = cast<VectorType>(InputA->getType());
31408 if (Accumulator == nullptr)
31409 Accumulator = Constant::getNullValue(Ty);
31410 bool IsScalable = Ty->isScalableTy();
31411 bool IsInt = Ty->getElementType()->isIntegerTy();
31412
31413 unsigned TyWidth =
31414 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
31415
31416 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
31417 "Vector type must be either 64 or a power of 2 that is at least 128");
31418
31419 if (TyWidth > 128) {
31420 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
31421 int AccStride = cast<VectorType>(Accumulator->getType())
31422 ->getElementCount()
31423 .getKnownMinValue() /
31424 2;
31425 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
31426 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
31427 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
31428 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
31429 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
31430 Value *LowerSplitAcc = nullptr;
31431 Value *UpperSplitAcc = nullptr;
31432 Type *FullTy = Ty;
31433 FullTy = Accumulator->getType();
31434 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
31435 cast<VectorType>(Accumulator->getType()));
31436 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
31437 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
31438 auto *LowerSplitInt = createComplexDeinterleavingIR(
31439 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
31440 auto *UpperSplitInt = createComplexDeinterleavingIR(
31441 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
31442
31443 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
31444 LowerSplitInt, uint64_t(0));
31445 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
31446 }
31447
31448 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
31449 if (IsScalable) {
31450 if (IsInt)
31451 return B.CreateIntrinsic(
31452 Intrinsic::aarch64_sve_cmla_x, Ty,
31453 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31454
31455 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31456 return B.CreateIntrinsic(
31457 Intrinsic::aarch64_sve_fcmla, Ty,
31458 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31459 }
31460
31461 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
31462 Intrinsic::aarch64_neon_vcmla_rot90,
31463 Intrinsic::aarch64_neon_vcmla_rot180,
31464 Intrinsic::aarch64_neon_vcmla_rot270};
31465
31466
31467 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
31468 {Accumulator, InputA, InputB});
31469 }
31470
31471 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
31472 if (IsScalable) {
31473 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
31474 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
31475 if (IsInt)
31476 return B.CreateIntrinsic(
31477 Intrinsic::aarch64_sve_cadd_x, Ty,
31478 {InputA, InputB, B.getInt32((int)Rotation * 90)});
31479
31480 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31481 return B.CreateIntrinsic(
31482 Intrinsic::aarch64_sve_fcadd, Ty,
31483 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
31484 }
31485 return nullptr;
31486 }
31487
31488 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
31489 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
31490 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
31491 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
31492 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
31493
31494 if (IntId == Intrinsic::not_intrinsic)
31495 return nullptr;
31496
31497 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
31498 }
31499
31500 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
31501 IsScalable) {
31502 return B.CreateIntrinsic(
31503 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
31504 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31505 }
31506
31507 return nullptr;
31508}
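
For reference, the rotation argument passed to the CADD/FCADD intrinsics above (Rotation * 90) follows the architectural definition of complex add with rotate: the second operand is rotated by 90 or 270 degrees in the complex plane before the addition. A scalar model (illustrative, per complex element):

#include <complex>

std::complex<float> complexAddRotated(std::complex<float> A,
                                      std::complex<float> B,
                                      unsigned RotationDegrees) {
  if (RotationDegrees == 90)
    B = {-B.imag(), B.real()}; // multiply B by i
  else if (RotationDegrees == 270)
    B = {B.imag(), -B.real()}; // multiply B by -i
  return A + B;
}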
31509
31510bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
31511 unsigned Opc = N->getOpcode();
31512 if (ISD::isExtOpcode(Opc)) {
31513 if (any_of(N->users(),
31514 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
31515 return false;
31516 }
31517 return true;
31518}
31519
31520 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
31521 return Subtarget->getMinimumJumpTableEntries();
31522}
31523
31524 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
31525 CallingConv::ID CC,
31526 EVT VT) const {
31527 bool NonUnitFixedLengthVector =
31528 VT.isFixedLengthVector() && VT.getVectorNumElements() > 1;
31529 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31530 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
31531
31532 EVT VT1;
31533 MVT RegisterVT;
31534 unsigned NumIntermediates;
31535 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
31536 RegisterVT);
31537 return RegisterVT;
31538}
31539
31540 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
31541 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
31542 bool NonUnitFixedLengthVector =
31543 VT.isFixedLengthVector() && VT.getVectorNumElements() > 1;
31544 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31545 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
31546
31547 EVT VT1;
31548 MVT VT2;
31549 unsigned NumIntermediates;
31550 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
31551 NumIntermediates, VT2);
31552}
31553
31554 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
31555 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
31556 unsigned &NumIntermediates, MVT &RegisterVT) const {
31557 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
31558 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
31559 if (!RegisterVT.isFixedLengthVector() ||
31560 RegisterVT.getFixedSizeInBits() <= 128)
31561 return NumRegs;
31562
31563 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
31564 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
31565 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
31566
31567 // A size mismatch here implies either type promotion or widening and would
31568 // have resulted in scalarisation if larger vectors had not been available.
31569 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
31570 EVT EltTy = VT.getVectorElementType();
31571 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
31572 if (!isTypeLegal(NewVT))
31573 NewVT = EltTy;
31574
31575 IntermediateVT = NewVT;
31576 NumIntermediates = VT.getVectorNumElements();
31577 RegisterVT = getRegisterType(Context, NewVT);
31578 return NumIntermediates;
31579 }
31580
31581 // SVE VLS support does not introduce a new ABI so we should use NEON sized
31582 // types for vector arguments and returns.
31583
31584 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
31585 NumIntermediates *= NumSubRegs;
31586 NumRegs *= NumSubRegs;
31587
31588 switch (RegisterVT.getVectorElementType().SimpleTy) {
31589 default:
31590 llvm_unreachable("unexpected element type for vector");
31591 case MVT::i8:
31592 IntermediateVT = RegisterVT = MVT::v16i8;
31593 break;
31594 case MVT::i16:
31595 IntermediateVT = RegisterVT = MVT::v8i16;
31596 break;
31597 case MVT::i32:
31598 IntermediateVT = RegisterVT = MVT::v4i32;
31599 break;
31600 case MVT::i64:
31601 IntermediateVT = RegisterVT = MVT::v2i64;
31602 break;
31603 case MVT::f16:
31604 IntermediateVT = RegisterVT = MVT::v8f16;
31605 break;
31606 case MVT::f32:
31607 IntermediateVT = RegisterVT = MVT::v4f32;
31608 break;
31609 case MVT::f64:
31610 IntermediateVT = RegisterVT = MVT::v2f64;
31611 break;
31612 case MVT::bf16:
31613 IntermediateVT = RegisterVT = MVT::v8bf16;
31614 break;
31615 }
31616
31617 return NumRegs;
31618}
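
A worked example of the breakdown above (assumed scenario, not from the sources): with SVE VLS making a 512-bit fixed-length vector a legal register type, an argument of that type is still passed as four 128-bit (Q-register sized) pieces so that the calling convention is unchanged.

#include <cassert>

int main() {
  unsigned RegisterBits = 512;              // e.g. v8i64 under 512-bit SVE VLS
  unsigned NumRegs = 1, NumIntermediates = 1;
  unsigned NumSubRegs = RegisterBits / 128; // 4 NEON-sized sub-registers
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;
  assert(NumRegs == 4 && NumIntermediates == 4); // passed as 4 x v2i64
  return 0;
}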
31619
31620 bool AArch64TargetLowering::hasInlineStackProbe(
31621 const MachineFunction &MF) const {
31622 return !Subtarget->isTargetWindows() &&
31623 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
31624}
31625
31627 switch (Opc) {
31631 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
31632 return true;
31633 }
31634
31636}
31637
31639 EVT VT) const {
31640 return Subtarget->hasCPA() && UseFEATCPACodegen;
31641}
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST, APInt &DefBits)
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performVectorDeinterleaveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue performPTestFirstCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static const MachineInstr * stripVRegCopies(const MachineRegisterInfo &MRI, Register Reg)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
If the operand is a bitwise AND with a constant RHS, and the shift has a constant RHS and is the only...
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
bool isVectorizedBinOp(unsigned Opcode)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue emitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &DL, SelectionDAG &DAG)
Emit vector comparison for floating-point values, producing a mask.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static SDValue performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static bool shouldLowerTailCallStackArg(const MachineFunction &MF, const CCValAssign &VA, SDValue Arg, ISD::ArgFlagsTy Flags, int CallOffset)
Check whether a stack argument requires lowering in a tail call.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool isCMP(SDValue Op)
return SDValue()
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, const AtomicRMWInst *RMW)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
unsigned numberOfInstrToLoadImm(APInt C)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs Currently only those that pass varargs like the C...
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, const AArch64TargetLowering &TLI, const AArch64RegisterInfo &TRI, AArch64FunctionInfo &FuncInfo, SelectionDAG &DAG)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parseSVERegAsConstraint(StringRef Constraint)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static bool isLane0KnownActive(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
constexpr MVT CondCodeVT
Value type used for condition codes.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG)
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
bool isLegalCmpImmed(APInt C)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, SDValue RHS={})
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtInReg(const SDValue &V)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Tries to replace scalar FP <-> INT conversions with SVE in streaming functions, this can help to redu...
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC)
Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, ISD::CondCode CC, bool NoNaNs, const SDLoc &DL, SelectionDAG &DAG)
For SELECT_CC, when the true/false values are (-1, 0) and the compared values are scalars,...
static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, AArch64FunctionInfo &FuncInfo, SelectionDAG &DAG)
static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC)
static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
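A rough sketch of the usual shape of such a predicate (the exact set of conventions is an assumption here, not taken from this file): conventions designed around guaranteed tail calls always qualify, while fastcc qualifies only when tail-call guarantees were explicitly requested.
// Sketch under the stated assumption about which conventions qualify.
static bool canGuaranteeTCOSketch(CallingConv::ID CC, bool GuaranteeTailCalls) {
  switch (CC) {
  case CallingConv::Fast:
    return GuaranteeTailCalls;   // only when tail-call guarantees are requested
  case CallingConv::Tail:
  case CallingConv::SwiftTail:
    return true;                 // musttail-oriented conventions
  default:
    return false;
  }
}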
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
@ Default
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This file provides utility analysis objects describing memory locations.
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable; if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static constexpr int Concat[]
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMaximumJumpTableSize() const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two adds is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
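As a point of reference, AArch64 ADD/SUB immediates are 12-bit unsigned values, optionally shifted left by 12; a standalone sketch of that encoding check (illustrative only, with a hypothetical helper name; the real override also handles further cases):
// Returns true if |Imm| fits the ADD/SUB immediate encoding: imm12 or imm12 << 12.
static bool fitsAddSubImmediate(int64_t Imm) {
  uint64_t V = Imm < 0 ? -static_cast<uint64_t>(Imm) : static_cast<uint64_t>(Imm);
  return (V & ~0xfffULL) == 0 || (V & ~0xfff000ULL) == 0;
}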
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
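The APInt helpers listed above appear throughout the DAG combines in this file; a small self-contained example of a few of them (the values and the function name are chosen purely for illustration):
#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintExamples() {
  APInt Mask = APInt::getLowBitsSet(32, 8);   // 0x000000FF
  bool IsMask8 = Mask.isMask(8);              // true: exactly the low 8 bits set
  unsigned Ones = Mask.popcount();            // 8
  APInt All = APInt::getAllOnes(32);          // 0xFFFFFFFF
  bool Subset = Mask.isSubsetOf(All);         // true: every set bit of Mask is set in All
  APInt Arith = All.ashr(4);                  // still all ones (sign bit is replicated)
  (void)IsMask8; (void)Ones; (void)Subset; (void)Arith;
}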
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ And
*p = old & v
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:321
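A quick illustration of the ElementCount factories listed above, which distinguish fixed-width from scalable (vscale-multiplied) element counts (function name is illustrative):
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountExamples() {
  ElementCount Fixed4 = ElementCount::getFixed(4);     // e.g. <4 x i32>
  ElementCount Scal2  = ElementCount::getScalable(2);  // e.g. <vscale x 2 x i64>
  bool OneLane = ElementCount::getFixed(1).isScalar(); // true: exactly one element
  (void)Fixed4; (void)Scal2; (void)OneLane;
}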
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
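The two FixedVectorType helpers above are commonly paired when a floating-point vector needs an integer vector of the same shape; a short standalone illustration (function name is hypothetical):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void fixedVectorTypeExamples(LLVMContext &Ctx) {
  auto *V16F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 16); // <16 x float>
  auto *V16I32 = FixedVectorType::getInteger(V16F32);             // <16 x i32>
  (void)V16I32;
}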
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1939
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2254
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2511
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
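Composed together, the IRBuilderBase calls listed above cover the common "offset a raw pointer by N bytes" pattern; a minimal sketch (hypothetical helper, not code from this file):
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Offset Base by Offset bytes and return the result as an opaque pointer.
static Value *offsetPointerSketch(IRBuilderBase &IRB, Value *Base, unsigned Offset) {
  Value *GEP = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), Base, Offset);
  return IRB.CreatePointerCast(GEP, IRB.getPtrTy(/*AddrSpace=*/0));
}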
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2783
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
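A few of the MVT factories and queries listed above, shown on concrete types (standalone illustration; the header path assumes the current in-tree location of MachineValueType.h):
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

void mvtExamples() {
  MVT NXV4I32 = MVT::getScalableVectorVT(MVT::i32, 4); // nxv4i32
  MVT V2I64   = MVT::getVectorVT(MVT::i64, 2);         // v2i64
  bool Scalable = NXV4I32.isScalableVector();          // true
  bool Is128    = V2I64.is128BitVector();              // true
  MVT V1I64     = V2I64.getHalfNumVectorElementsVT();  // v1i64
  (void)Scalable; (void)Is128; (void)V1I64;
}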
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool hasScalableStackID(int ObjectIdx) const
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
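The frame-index methods above are what SVE lowering leans on when it needs stack slots whose allocated size scales with vscale; a short sketch (the TargetStackID::ScalableVector value is assumed from TargetFrameLowering.h, and the size and helper name are illustrative):
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
using namespace llvm;

int createScalableSpillSlotSketch(MachineFrameInfo &MFI) {
  int FI = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/true);
  MFI.setStackID(FI, TargetStackID::ScalableVector); // allocation scales with vscale
  return FI; // MFI.getObjectSize(FI) still reports the 16-byte "unit" size
}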
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
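Typical use of the builder methods listed above, shown on a made-up AArch64 add-immediate (the ADDXri operand layout of register, 12-bit immediate, shift is architectural; the helper itself is illustrative and assumes the AArch64 opcode enums from the backend headers this file already includes):
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit DestReg = SrcReg + 16 immediately before MBBI.
static void emitAddImmSketch(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                             const DebugLoc &DL, const TargetInstrInfo *TII,
                             Register DestReg, Register SrcReg) {
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), DestReg)
      .addReg(SrcReg)
      .addImm(16)   // imm12
      .addImm(0);   // shift amount: 0 or 12
}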
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
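A minimal sketch, in the style of a DAG combine, of the MemSDNode queries above; the bail-out conditions are illustrative.
#include "llvm/CodeGen/SelectionDAGNodes.h"
static SDValue combineMemNodeSketch(SDNode *N) {
  auto *MemN = dyn_cast<MemSDNode>(N);
  if (!MemN || !MemN->isSimple()) // reject volatile or atomic accesses
    return SDValue();
  EVT MemVT = MemN->getMemoryVT();        // type of the in-memory value
  Align A = MemN->getAlign();             // known alignment of the access
  if (MemVT != MVT::i64 || A.value() < 8) // only naturally aligned i64 accesses
    return SDValue();
  return SDValue(N, 0); // a real combine would build a replacement node here
}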
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:712
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
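A minimal sketch of the SDNode/SDValue inspection helpers above: match a single-use (add x, C) and extract the constant.
#include "llvm/CodeGen/SelectionDAGNodes.h"
static bool matchSingleUseAddImm(SDValue V, uint64_t &CVal) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  SDNode *N = V.getNode();
  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return false;
  CVal = N->getConstantOperandVal(1); // zero-extended constant operand
  return true;
}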
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
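A minimal sketch of building nodes with the SelectionDAG factory methods above; the rewrite itself (negation via NOT plus one) is purely illustrative.
#include "llvm/CodeGen/SelectionDAG.h"
static SDValue buildNegSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue X = Op.getOperand(0);
  SDValue NotX = DAG.getNOT(DL, X, VT);            // (xor X, -1)
  SDValue One = DAG.getConstant(1, DL, VT);
  return DAG.getNode(ISD::ADD, DL, VT, NotX, One); // -X == ~X + 1
}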
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
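A minimal sketch of the shuffle-mask classification helpers above, applied to a hand-written four-element mask.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
static void classifyMaskSketch() {
  SmallVector<int, 4> Mask = {3, 2, 1, 0};
  bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);      // true
  bool OneSource = ShuffleVectorInst::isSingleSourceMask(Mask, /*NumSrcElts=*/4); // true
  (void)IsReverse;
  (void)OneSource;
}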
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
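A minimal sketch of the SmallVector/SmallPtrSet pattern that DAG walks in this file typically follow: a worklist paired with a visited set.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
static void visitReachableNodes(const SDNode *Root) {
  SmallVector<const SDNode *, 16> Worklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    const SDNode *N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // already visited
    for (const SDValue &Op : N->op_values())
      Worklist.push_back(Op.getNode());
  }
}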
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:573
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:686
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
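A minimal sketch of the StringRef parsing and StringSwitch dispatch helpers above; the constraint strings and return values are illustrative, not this target's actual inline-asm handling.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
static unsigned classifyConstraintSketch(StringRef Constraint) {
  if (Constraint.starts_with("{") && Constraint.ends_with("}"))
    Constraint = Constraint.slice(1, Constraint.size() - 1); // strip braces
  unsigned RegNum = 0;
  if (Constraint.starts_with("x") &&
      !Constraint.drop_front(1).getAsInteger(10, RegNum)) // e.g. "x21" -> 21
    return RegNum;
  return StringSwitch<unsigned>(Constraint)
      .Case("sp", 31)
      .Case("lr", 30)
      .Default(~0u);
}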
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
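A minimal sketch (a fragment of a hypothetical TargetLowering subclass constructor) of how the configuration hooks above are typically used; the specific types, actions, and the Subtarget member are illustrative, not this file's actual setup.
// Inside the constructor of a TargetLowering subclass:
addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);  // no single divrem instruction
setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTargetDAGCombine(ISD::ADD);                       // request target combine callbacks
setMaxAtomicSizeInBitsSupported(128);
computeRegisterProperties(Subtarget->getRegisterInfo()); // after all addRegisterClass calls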
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:299
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:283
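A minimal sketch of the Type factory methods and predicates listed above, assuming an LLVMContext is available.
#include "llvm/IR/Type.h"
static void typeQuerySketch(LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  Type *F64 = Type::getDoubleTy(Ctx);
  bool Sane = I64->isIntegerTy() && F64->isFloatingPointTy(); // both true
  TypeSize Bits = F64->getPrimitiveSizeInBits();              // fixed, 64 bits
  (void)Sane;
  (void)Bits;
}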
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
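A minimal IR-level sketch of the Value use-list helpers above: fold a single-use value into a replacement of the same type.
#include "llvm/IR/Value.h"
static void replaceIfSingleUse(Value *V, Value *Replacement) {
  if (V->hasOneUse() && V->getType() == Replacement->getType())
    V->replaceAllUsesWith(Replacement);
}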
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
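A minimal sketch of the scalable-vector and ElementCount/TypeSize queries above, assuming an LLVMContext is available.
#include "llvm/IR/DerivedTypes.h"
static void scalableVectorSketch(LLVMContext &Ctx) {
  auto *NxV4I32 = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4); // <vscale x 4 x i32>
  ElementCount EC = NxV4I32->getElementCount();
  bool Scalable = EC.isScalable();               // true
  unsigned MinElts = EC.getKnownMinValue();      // 4
  ElementCount Half = EC.divideCoefficientBy(2); // vscale x 2 elements
  (void)Scalable;
  (void)MinElts;
  (void)Half;
}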
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
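A minimal sketch of the AArch64 condition-code helpers above, assuming the AArch64 utility headers this file already includes.
static unsigned invertedNZCVSketch() {
  // Invert EQ to NE, then ask which NZCV flag settings satisfy the inverted code.
  AArch64CC::CondCode Inv = AArch64CC::getInvertedCondCode(AArch64CC::EQ);
  return AArch64CC::getNZCVToSatisfyCondCode(Inv);
}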
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
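A minimal sketch of the logical-immediate helpers above, assuming the AArch64 addressing-mode header this file already includes.
static bool tryEncodeLogicalImmSketch(uint64_t Imm, uint64_t &Encoded) {
  // Only a subset of 64-bit constants can be encoded in AND/ORR/EOR (immediate).
  if (!AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
    return false;
  Encoded = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
  return true;
}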
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ LOOP_DEPENDENCE_RAW_MASK
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:855
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:627
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:648
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:611
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
@ LOOP_DEPENDENCE_WAR_MASK
Produce a lane mask describing write-after-read loop dependences between a load pointer and a store pointer.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
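The two condition-code helpers above are easiest to see side by side. A minimal sketch follows; the include paths and the helper function name are assumptions for illustration, not taken from this file.
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"

  // Hypothetical helper: given the condition code for (X op Y) on i64,
  // derive the codes for !(X op Y) and for (Y op X).
  static void demoCondCodeRewrites() {
    llvm::EVT VT = llvm::MVT::i64;
    llvm::ISD::CondCode CC = llvm::ISD::SETLT;                         // X < Y, signed
    llvm::ISD::CondCode Inv = llvm::ISD::getSetCCInverse(CC, VT);      // X >= Y
    llvm::ISD::CondCode Swapped = llvm::ISD::getSetCCSwappedOperands(CC); // Y > X
    (void)Inv;
    (void)Swapped;
  }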
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
const unsigned VectorBits
Definition SystemZ.h:154
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
bool isPackedVectorType(EVT SomeVT)
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
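A small illustration of these range wrappers, which the lowering code uses instead of explicit begin/end iterator pairs. The include paths, function name, and data are assumptions made for this sketch.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  static void demoRangeHelpers() {
    llvm::SmallVector<int, 8> Ops = {2, 4, 6, 8};
    bool AllEven = llvm::all_of(Ops, [](int V) { return V % 2 == 0; }); // true
    bool HasBig  = llvm::any_of(Ops, [](int V) { return V > 7; });      // true
    auto It      = llvm::find_if(Ops, [](int V) { return V > 5; });     // points at 6
    (void)AllEven; (void)HasBig; (void)It;
  }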
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:216
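As a worked example, this is the kind of range check performed when deciding whether a value fits a fixed-width unsigned immediate field. The helper name and the 12-bit width are hypothetical; the include path is assumed.
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // Hypothetical check: does Imm fit in a 12-bit unsigned immediate field?
  static bool fitsInUImm12(uint64_t Imm) {
    // maxUIntN(12) == 4095, so the two forms below are equivalent.
    return Imm <= llvm::maxUIntN(12) && llvm::isUIntN(12, Imm);
  }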
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
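A short sketch showing how these bit-pattern predicates combine, for example to split a shifted run of ones into its start bit and width. The decomposition helper is hypothetical and the include paths are assumed.
  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // Hypothetical decomposition: for a value like 0x0FF0, report the position
  // of the lowest set bit and the length of the contiguous run of ones.
  static bool decomposeShiftedMask(uint64_t V, unsigned &Shift, unsigned &Width) {
    if (!llvm::isShiftedMask_64(V))
      return false;                  // not a single contiguous run of ones
    Shift = llvm::countr_zero(V);    // trailing zeros below the run
    Width = llvm::popcount(V);       // number of ones in the run
    return true;
  }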
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:270
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
CombineLevel
Definition DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
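To make the mask shape concrete, here is a standalone, hypothetical re-statement of the zip1/zip2 pattern the predicate above recognises, e.g. <0, 8, 1, 9, 2, 10, 3, 11> for zip1 of two 8-element vectors. It is not the backend helper itself and, unlike the real predicate, it does not tolerate undef lanes.
  #include "llvm/ADT/ArrayRef.h"

  static bool looksLikeZipMask(llvm::ArrayRef<int> M, unsigned NumElts,
                               unsigned &WhichResult) {
    for (unsigned Which = 0; Which < 2; ++Which) {
      unsigned Base = Which * (NumElts / 2); // zip1 starts at 0, zip2 at NumElts/2
      bool Match = M.size() == NumElts;
      for (unsigned I = 0; Match && I < NumElts / 2; ++I) {
        Match &= M[2 * I] == int(Base + I);               // lane from the first vector
        Match &= M[2 * I + 1] == int(Base + I + NumElts); // lane from the second vector
      }
      if (Match) {
        WhichResult = Which;
        return true;
      }
    }
    return false;
  }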
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2122
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
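A tiny example of how alignTo and commonAlignment, both listed in this index, interact when laying out stack objects. The function name is hypothetical and the include path is assumed.
  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  static void demoAlignment() {
    llvm::Align A16(16);
    uint64_t Size = llvm::alignTo(20, A16);              // 32: next multiple of 16
    llvm::Align Common = llvm::commonAlignment(A16, 8);  // 8: offset 8 within a 16-byte object
    (void)Size;
    (void)Common;
  }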
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2110
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
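For illustration, createSequentialMask produces the kind of index mask a shufflevector or vector-concatenation lowering consumes. The include path and function name are assumptions for this sketch.
  #include "llvm/Analysis/VectorUtils.h"

  static void demoSequentialMask() {
    // {0, 1, 2, 3}: selects the first four lanes in order.
    llvm::SmallVector<int, 16> Mask0 = llvm::createSequentialMask(0, 4, 0);
    // {4, 5, 6, 7, -1, -1}: starts at lane 4 and pads with two undef lanes.
    llvm::SmallVector<int, 16> Mask1 = llvm::createSequentialMask(4, 4, 2);
    (void)Mask0;
    (void)Mask1;
  }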
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:207
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
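A compact sketch tying several of the EVT accessors above together. The include paths and function name are assumptions; the specific types are chosen only for illustration.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include <cassert>

  static void demoEVT() {
    llvm::LLVMContext Ctx;
    llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // <4 x float>
    assert(V4F32.is128BitVector() && V4F32.isFloatingPoint());
    llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();       // <4 x i32>
    llvm::EVT V2I32 = V4I32.getHalfNumVectorElementsVT(Ctx);          // <2 x i32>
    assert(V2I32.getVectorNumElements() == 2 &&
           V2I32.getScalarSizeInBits() == 32);
  }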
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute known bits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
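A short sketch of the KnownBits helpers listed above, in the spirit of the target's known-bits computation hooks. The include paths and function name are assumptions; the constants are arbitrary.
  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"

  static llvm::KnownBits demoKnownBits() {
    using llvm::APInt;
    using llvm::KnownBits;
    KnownBits Four = KnownBits::makeConstant(APInt(64, 4)); // value fully known: 4
    KnownBits One  = KnownBits::makeConstant(APInt(64, 1)); // value fully known: 1
    KnownBits Sum  = KnownBits::add(Four, One);             // known to be 5
    KnownBits Shl  = KnownBits::shl(Sum, One);              // known to be 10
    return Shl.trunc(32);                                   // narrow the tracked value to 32 bits
  }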
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64