1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, so the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. This maximum leaf-node
144// limit guards that the cmp+ccmp combine stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207static inline MVT getPackedSVEVectorVT(ElementCount EC) {
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222static inline EVT getPromotedVTForPredicate(EVT VT) {
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
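// For example, given the doc comment above and a 128-bit SVE block:
//   isPackedVectorType(MVT::nxv8f16, DAG) -> true  (8 x 16 bits fill the block)
//   isPackedVectorType(MVT::nxv4f16, DAG) -> false (only 64 bits of each block)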
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
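// For example, FABS_MERGE_PASSTHRU typically carries operands
//   (Pg, Src, Passthru)
// and selects a merging predicated SVE instruction, roughly:
//   fabs z0.d, p0/m, z1.d   // inactive lanes keep the passthru value in z0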
290
291// Returns true if inactive lanes are known to be zeroed by construction.
292static bool isZeroingInactiveLanes(SDValue Op) {
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
297 case ISD::SPLAT_VECTOR:
298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
302 case ISD::INTRINSIC_WO_CHAIN:
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
355extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
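// Worked examples of the split performed above (sketch):
//   ptrauth.blend(%addr, 1234) -> (TargetConstant 1234, %addr)
//   constant 1234              -> (TargetConstant 1234, NoRegister /* later XZR */)
//   non-constant %disc         -> (TargetConstant 0,    %disc)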
387
388AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
486 computeRegisterProperties(Subtarget->getRegisterInfo());
487
488 // Provide all sorts of operation actions
506 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
507 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
508 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
509 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
510 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
511 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
514 if (Subtarget->hasFPARMv8()) {
517 }
526 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
528 setOperationAction(ISD::BRIND, MVT::Other, Custom);
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
548 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
549 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
550
551 // Virtually no operation on f128 is legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
553 setOperationAction(ISD::FABS, MVT::f128, Expand);
556 setOperationAction(ISD::FCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FNEG, MVT::f128, Expand);
561 setOperationAction(ISD::FPOW, MVT::f128, Expand);
563 setOperationAction(ISD::FRINT, MVT::f128, Expand);
564 setOperationAction(ISD::FSIN, MVT::f128, Expand);
565 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
566 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
568 setOperationAction(ISD::FTAN, MVT::f128, Expand);
569 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
573 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
576 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
625 setOperationAction(ISD::VASTART, MVT::Other, Custom);
626 setOperationAction(ISD::VAARG, MVT::Other, Custom);
627 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
628 setOperationAction(ISD::VAEND, MVT::Other, Expand);
629
630 // Variable-sized objects.
631 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
632 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
633
634 // Lowering Funnel Shifts to EXTR
639
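// For example, a constant funnel shift right of a register pair is expected to
// map onto the EXTR instruction, roughly:
//   fshr(hi, lo, 12)  ->  extr x0, xHi, xLo, #12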
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
737 setOperationAction(ISD::FSIN, MVT::f32, Expand);
738 setOperationAction(ISD::FSIN, MVT::f64, Expand);
739 setOperationAction(ISD::FCOS, MVT::f32, Expand);
740 setOperationAction(ISD::FCOS, MVT::f64, Expand);
741 setOperationAction(ISD::FPOW, MVT::f32, Expand);
742 setOperationAction(ISD::FPOW, MVT::f64, Expand);
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
754 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
755 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
756 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
757 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
758 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
759 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
782 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
783 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
786 // fpextend from bf16 to f64 needs to be split into two fpextends
787 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
794 ISD::BR_CC,
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
800 ISD::FCEIL,
801 ISD::FSQRT,
802 ISD::FFLOOR,
803 ISD::FNEARBYINT,
804 ISD::FRINT,
805 ISD::FROUND,
806 ISD::FROUNDEVEN,
807 ISD::FTRUNC,
808 ISD::FMINNUM,
809 ISD::FMAXNUM,
810 ISD::FMINIMUM,
811 ISD::FMAXIMUM,
812 ISD::FMINIMUMNUM,
813 ISD::FMAXIMUMNUM,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
838 // Round-to-integer operations need custom lowering for fp16, as Promote
839 // doesn't work because the result type is integer.
840 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
843 setOperationAction(Op, ScalarVT, Custom);
844
845 // promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
861 setOperationAction(ISD::FABS, V4Narrow, Legal);
862 setOperationAction(ISD::FNEG, V4Narrow, Legal);
864 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
868 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
874 setOperationAction(ISD::FABS, V8Narrow, Legal);
876 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
879 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
882 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
883 setOperationAction(ISD::FNEG, V8Narrow, Legal);
884 setOperationAction(ISD::FROUND, V8Narrow, Legal);
885 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
886 setOperationAction(ISD::FRINT, V8Narrow, Legal);
887 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
889 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
890 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
893 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
906 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
907 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
908 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
909 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
910 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
911 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
917 for (MVT Ty : {MVT::f32, MVT::f64})
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
933 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
934
936 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
937 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
938 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
939 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
940
941 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
943 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
944 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
945 } else {
946 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
947 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
948 }
949 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
950 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
951
952 // Generate outline atomics library calls only if LSE was not specified for
953 // the subtarget.
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
955 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
956 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
957 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
958 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
959 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
960 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
961 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
962 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
963 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
974 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
975 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
976 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
977 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
980 }
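// With outline atomics, e.g. a monotonic i32 atomic add is expected to become
// a call to a compiler-rt helper such as __aarch64_ldadd4_relax (naming
// convention: <op><size-in-bytes>_<memory-order>); the helper selects LSE or
// LL/SC instructions at run time.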
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
983 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
986 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
987
988 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
991 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
992
993 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
996 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
997
998 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
1001 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
1002
1003 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
1004 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
1005 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1006 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1012 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1013 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1014 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::STORE, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1024 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1025 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1026 }
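// Under LSE2, e.g. an aligned 16-byte atomic load can be emitted as a plain
// register-pair load rather than a CAS loop, roughly:
//   ldp x0, x1, [x2]   // single-copy atomic when [x2] is 16-byte aligned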
1027
1028 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1029 // custom lowering, as there are no un-paired non-temporal stores and
1030 // legalization will break up 256 bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
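// For example, a v32i8 non-temporal store marked Custom above can be emitted
// as a single paired non-temporal store of two q registers, roughly:
//   stnp q0, q1, [x0]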
1039
1040 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1041 // custom lowering, as there are no un-paired non-temporal loads and
1042 // legalization will break up 256 bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1053 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
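// i.e. the counter is read with something like:
//   mrs x0, cntvct_el0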
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1058 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060 } else {
1061 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063 }
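// When available, __sincos_stret returns both results from one call, so a
// sinf(x)/cosf(x) pair can be folded into a single library call instead of two.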
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
1073 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1093 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1094 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1095 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1096 }
1097
1098 // Indexed loads and stores are supported.
1099 for (unsigned im = (unsigned)ISD::PRE_INC;
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
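// Pre- and post-indexed forms correspond to addressing modes such as:
//   ldr x0, [x1, #16]!   // pre-indexed:  x1 += 16, then load from x1
//   ldr x0, [x1], #16    // post-indexed: load from x1, then x1 += 16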
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1121 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1130 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1131 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
1134 // Also, try to fold ADD into CSINC/CSINV.
1137
1140
1141 // Try and combine setcc with csel
1143
1145
1149 ISD::STORE, ISD::BUILD_VECTOR});
1152 setTargetDAGCombine(ISD::LOAD);
1153
1154 setTargetDAGCombine(ISD::MSTORE);
1155
1157
1159
1162 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1163
1165 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1166
1167 setTargetDAGCombine(ISD::FP_EXTEND);
1168
1170
1172
1173 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1174
1175 setTargetDAGCombine(ISD::VECREDUCE_AND);
1176 setTargetDAGCombine(ISD::VECREDUCE_OR);
1177 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1178
1180
1183
1184 // In case of strict alignment, avoid an excessive number of byte wide stores.
1187 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1188
1192 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1193
1196 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1197
1200 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1201
1203
1205
1206 EnableExtLdPromotion = true;
1207
1208 // Set required alignment.
1210 // Set preferred alignments.
1211
1212 // Don't align loops on Windows. The SEH unwind info generation needs to
1213 // know the exact length of functions before the alignments have been
1214 // expanded.
1215 if (!Subtarget->isTargetWindows())
1219
1220 // Only change the limit for entries in a jump table if specified by
1221 // the subtarget, but not at the command line.
1222 unsigned MaxJT = STI.getMaximumJumpTableSize();
1223 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1225
1227
1229
1231 if (Subtarget->hasSME())
1233
1234 if (Subtarget->isNeonAvailable()) {
1235 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1236 // silliness like this:
1237 // clang-format off
1238 for (auto Op :
1239 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1240 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1242 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1243 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1244 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1245 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1246 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1247 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1248 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1249 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1250 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1251 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1252 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1259 setOperationAction(Op, MVT::v1f64, Expand);
1260 // clang-format on
1261
1262 for (auto Op :
1267 setOperationAction(Op, MVT::v1i64, Expand);
1268
1269 // AArch64 doesn't have direct vector->f32 conversion instructions for
1270 // elements smaller than i32, so promote the input to i32 first.
1271 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
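// e.g. a v4i8 -> v4f32 uitofp is expected to widen the lanes to i32 first and
// then convert, roughly:
//   ushll v0.8h, v0.8b, #0
//   ushll v0.4s, v0.4h, #0
//   ucvtf v0.4s, v0.4s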
1273
1274 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1275 // nor a direct i32 -> f16 vector conversion. Mark these Custom so the
1276 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1279 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1281
1282 if (Subtarget->hasFullFP16()) {
1285
1294 } else {
1295 // When AArch64 doesn't have full fp16 support, promote the input
1296 // to i32 first.
1297 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1299 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1301 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1303 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1305 }
1306
1307 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1308 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1315 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1320 }
1321
1322 // Custom handling for some quad-vector types to detect MULL.
1323 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1324 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1325 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1326 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1327 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1328 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1329
1330 // Saturates
1331 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1332 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1337 }
1338
1339 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1340 MVT::v4i32}) {
1347 }
1348
1349 // Vector reductions
1350 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1351 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1352 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1353 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1354 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1355 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1356 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1357
1358 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1359 }
1360 }
1361 if (Subtarget->hasFullFP16())
1362 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1363
1364 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1365 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1366 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1367 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1368 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1369 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1370 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1371 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1372 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1373 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1374 }
1375 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1376 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1377 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1378 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1379
1381 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1382 // Likewise, narrowing and extending vector loads/stores aren't handled
1383 // directly.
1386
1387 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1390 } else {
1393 }
1396
1399
1400 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1401 setTruncStoreAction(VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1404 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1405 }
1406 }
1407
1408 for (auto Op :
1409 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1410 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1414 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1416 if (Subtarget->hasFullFP16())
1417 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1419 }
1420
1421 // LRINT and LLRINT.
1422 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1423 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1425 if (Subtarget->hasFullFP16())
1426 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1428 }
1429
1430 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1431
1432 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1433 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1434 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1435 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1436
1437 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1438 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1439 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1440
1441 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1447
1448 // ADDP custom lowering
1449 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1451 // FADDP custom lowering
1452 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1454
1455 if (Subtarget->hasDotProd()) {
1456 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457 ISD::PARTIAL_REDUCE_UMLA};
1458
1459 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1461 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1462
1463 if (Subtarget->hasMatMulInt8()) {
1464 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1465 MVT::v16i8, Legal);
1466 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1467 MVT::v16i8, Custom);
1468
1469 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1470 MVT::v8i8, Legal);
1471 }
1472 }
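// e.g. a v4i32 PARTIAL_REDUCE_UMLA of v16i8 inputs marked Legal above is
// expected to select the NEON dot-product instruction, roughly:
//   udot v0.4s, v1.16b, v2.16b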
1473
1474 } else /* !isNeonAvailable */ {
1476 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1478
1479 if (VT.is128BitVector() || VT.is64BitVector()) {
1480 setOperationAction(ISD::LOAD, VT, Legal);
1481 setOperationAction(ISD::STORE, VT, Legal);
1482 setOperationAction(ISD::BITCAST, VT,
1483 Subtarget->isLittleEndian() ? Legal : Expand);
1484 }
1485 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1486 setTruncStoreAction(VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1489 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1490 }
1491 }
1492 }
1493
1494 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1498 }
1499
1500 if (Subtarget->hasSME()) {
1502 }
1503
1504 // FIXME: Move lowering for more nodes here if those are common between
1505 // SVE and SME.
1506 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1507 for (auto VT :
1508 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1513 }
1514 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1515 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1516 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1517 }
1518
1519 if (Subtarget->hasSVE2p1() ||
1520 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1522
1523 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1525 }
1526
1527 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1528 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1539 setOperationAction(ISD::MLOAD, VT, Custom);
1559 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1560 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1561 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1562 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1563 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1564 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1565 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1566 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1569
1575
1584
1589
1590 if (!Subtarget->isLittleEndian())
1591 setOperationAction(ISD::BITCAST, VT, Custom);
1592
1593 if (Subtarget->hasSVE2() ||
1594 (Subtarget->hasSME() && Subtarget->isStreaming()))
1595 // For SLI/SRI.
1597 }
1598
1599 // Illegal unpacked integer vector types.
1600 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1603 }
1604
1605 // Type legalize unpacked bitcasts.
1606 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1607 setOperationAction(ISD::BITCAST, VT, Custom);
1608
1609 for (auto VT :
1610 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1611 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1613
1614 for (auto VT :
1615 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1620 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1621 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1623
1627
1628 // There are no legal MVT::nxv16f## based types.
1629 if (VT != MVT::nxv16i1) {
1634 }
1635 }
1636
1637 // NEON doesn't support masked loads/stores, but SME and SVE do.
1638 for (auto VT :
1639 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1640 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1641 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1642 setOperationAction(ISD::MLOAD, VT, Custom);
1643 setOperationAction(ISD::MSTORE, VT, Custom);
1644 }
1645
1646 // Firstly, exclude all scalable vector extending loads and truncating
1647 // stores, covering both integer and floating-point scalable vectors.
1649 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1650 setTruncStoreAction(VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1654 }
1655 }
1656
1657 // Then, selectively enable those which we directly support.
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1663 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1664 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1670 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1671 }
1672
1673 // SVE supports truncating stores of 64 and 128-bit vectors
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1679
1680 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1681 MVT::nxv4f32, MVT::nxv2f64}) {
1682 setOperationAction(ISD::BITCAST, VT, Custom);
1685 setOperationAction(ISD::MLOAD, VT, Custom);
1693 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1694 setOperationAction(ISD::FMAXNUM, VT, Custom);
1695 setOperationAction(ISD::FMINIMUM, VT, Custom);
1696 setOperationAction(ISD::FMINNUM, VT, Custom);
1698 setOperationAction(ISD::FNEG, VT, Custom);
1700 setOperationAction(ISD::FCEIL, VT, Custom);
1701 setOperationAction(ISD::FFLOOR, VT, Custom);
1702 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1703 setOperationAction(ISD::FRINT, VT, Custom);
1704 setOperationAction(ISD::LRINT, VT, Custom);
1705 setOperationAction(ISD::LLRINT, VT, Custom);
1706 setOperationAction(ISD::FROUND, VT, Custom);
1707 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1708 setOperationAction(ISD::FTRUNC, VT, Custom);
1709 setOperationAction(ISD::FSQRT, VT, Custom);
1710 setOperationAction(ISD::FABS, VT, Custom);
1711 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1713 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1721
1724 setOperationAction(ISD::FPOW, VT, Expand);
1725 setOperationAction(ISD::FPOWI, VT, Expand);
1726 setOperationAction(ISD::FCOS, VT, Expand);
1727 setOperationAction(ISD::FSIN, VT, Expand);
1728 setOperationAction(ISD::FSINCOS, VT, Expand);
1729 setOperationAction(ISD::FTAN, VT, Expand);
1730 setOperationAction(ISD::FACOS, VT, Expand);
1731 setOperationAction(ISD::FASIN, VT, Expand);
1732 setOperationAction(ISD::FATAN, VT, Expand);
1733 setOperationAction(ISD::FATAN2, VT, Expand);
1734 setOperationAction(ISD::FCOSH, VT, Expand);
1735 setOperationAction(ISD::FSINH, VT, Expand);
1736 setOperationAction(ISD::FTANH, VT, Expand);
1737 setOperationAction(ISD::FEXP, VT, Expand);
1738 setOperationAction(ISD::FEXP2, VT, Expand);
1739 setOperationAction(ISD::FEXP10, VT, Expand);
1740 setOperationAction(ISD::FLOG, VT, Expand);
1741 setOperationAction(ISD::FLOG2, VT, Expand);
1742 setOperationAction(ISD::FLOG10, VT, Expand);
1743
1755 }
1756
1757 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1758 setOperationAction(ISD::BITCAST, VT, Custom);
1760 setOperationAction(ISD::FABS, VT, Custom);
1762 setOperationAction(ISD::FNEG, VT, Custom);
1763 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1765 setOperationAction(ISD::MLOAD, VT, Custom);
1773
1774 if (Subtarget->hasSVEB16B16() &&
1775 Subtarget->isNonStreamingSVEorSME2Available()) {
1778 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1779 setOperationAction(ISD::FMAXNUM, VT, Custom);
1780 setOperationAction(ISD::FMINIMUM, VT, Custom);
1781 setOperationAction(ISD::FMINNUM, VT, Custom);
1784 }
1785 }
1786
1787 for (auto Opcode :
1788 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1789 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1790 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1791 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1792 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1793 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1795 }
1796
1797 if (!Subtarget->hasSVEB16B16() ||
1798 !Subtarget->isNonStreamingSVEorSME2Available()) {
1799 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1800 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1801 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1802 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1804 }
1805 }
1806
1809
1810 // NEON doesn't support integer divides, but SVE does
1811 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1812 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1815 }
1816
1817 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1818 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1819 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1820
1821 // NOTE: Currently this has to happen after computeRegisterProperties rather
1822 // than the preferred option of combining it with the addRegisterClass call.
1823 if (Subtarget->useSVEForFixedLengthVectors()) {
1826 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1827 addTypeForFixedLengthSVE(VT);
1828 }
1831 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1832 addTypeForFixedLengthSVE(VT);
1833 }
1834
1835 // 64-bit results can mean a bigger-than-NEON input.
1836 for (auto VT : {MVT::v8i8, MVT::v4i16})
1839
1840 // 128-bit results imply a bigger-than-NEON input.
1841 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1843 for (auto VT : {MVT::v8f16, MVT::v4f32})
1845
1846 // These operations are not supported on NEON but SVE can do them.
1848 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1849 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1850 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1851 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1853 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1855 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1857 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1859 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1861 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1863 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1867
1868 // Int operations with no NEON support.
1869 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1870 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1873 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1874 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1878 }
1879
1880 // Use SVE for vectors with more than 2 elements.
1881 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1882 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1883 }
1884
1885 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1889
1890 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1891
1892 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1894 }
1895
1896 // Handle partial reduction operations
1897 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1898 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1899 // Other pairs will default to 'Expand'.
1900 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1901 ISD::PARTIAL_REDUCE_UMLA};
1902 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1904
1905 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1906
1907 if (Subtarget->hasMatMulInt8()) {
1908 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1909 MVT::nxv16i8, Legal);
1910 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1911 MVT::nxv16i8, Custom);
1912 }
1913
1914 // Wide add types
1915 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1916 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1919 }
1920 }
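// e.g. an nxv4i32 partial reduction of nxv16i8 inputs is expected to select
// the SVE dot-product form, roughly:
//   udot z0.s, z1.b, z2.b   // or sdot for the signed variant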
1921
1922 // Handle non-aliasing elements mask
1923 if (Subtarget->hasSVE2() ||
1924 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1925 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1926 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1929 }
1930 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1933 }
1934 }
1935
1936 // Handle operations that are only available in non-streaming SVE mode.
1937 if (Subtarget->isSVEAvailable()) {
1938 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1939 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1940 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1941 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1942 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1943 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1944 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1945 setOperationAction(ISD::MGATHER, VT, Custom);
1946 setOperationAction(ISD::MSCATTER, VT, Custom);
1947 }
1948
1949 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1950 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1951 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1952 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1953
1954 // We can lower types that have <vscale x {2|4}> elements to compact.
1955 for (auto VT :
1956 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1957 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1959
1960 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1961 // NEON vectors in the lowest bits of the SVE register.
1962 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1963 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1965
1966 // Histcnt is SVE2 only
1967 if (Subtarget->hasSVE2()) {
1968 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1969 Custom);
1970 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1971 Custom);
1972
1973 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1974 ISD::PARTIAL_REDUCE_UMLA};
1975 // Must be lowered to SVE instructions.
1976 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1977 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1978 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1979 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1980 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1981 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1982 }
1983 }
1984
1985 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1986 // Only required for llvm.aarch64.mops.memset.tag
1988 }
1989
1991
1992 if (Subtarget->hasSVE()) {
1993 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1994 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1995 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1996 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1997 }
1998
1999 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2000
2001 IsStrictFPEnabled = true;
2003
2004 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2005 // it, but it's just a wrapper around ldexp.
2006 if (Subtarget->isTargetWindows()) {
2007 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2008 if (isOperationExpand(Op, MVT::f32))
2009 setOperationAction(Op, MVT::f32, Promote);
2010 }
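 // Illustrative note (inferred from the comment above): Promote here widens the
 // f32 operation so that the existing f64 handling is used instead, which on
 // Windows is expected to end up calling ldexp rather than the missing ldexpf.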
2011
2012 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2013 // isn't legal.
2014 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2015 if (isOperationExpand(Op, MVT::f16))
2016 setOperationAction(Op, MVT::f16, Promote);
2017}
2018
2020 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2021}
2022
2023void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2024 assert(VT.isVector() && "VT should be a vector type");
2025
2026 if (VT.isFloatingPoint()) {
2028 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2029 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2030 }
2031
2032 // Mark vector float intrinsics as expand.
2033 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2034 setOperationAction(ISD::FSIN, VT, Expand);
2035 setOperationAction(ISD::FCOS, VT, Expand);
2036 setOperationAction(ISD::FTAN, VT, Expand);
2037 setOperationAction(ISD::FASIN, VT, Expand);
2038 setOperationAction(ISD::FACOS, VT, Expand);
2039 setOperationAction(ISD::FATAN, VT, Expand);
2040 setOperationAction(ISD::FATAN2, VT, Expand);
2041 setOperationAction(ISD::FSINH, VT, Expand);
2042 setOperationAction(ISD::FCOSH, VT, Expand);
2043 setOperationAction(ISD::FTANH, VT, Expand);
2044 setOperationAction(ISD::FPOW, VT, Expand);
2045 setOperationAction(ISD::FLOG, VT, Expand);
2046 setOperationAction(ISD::FLOG2, VT, Expand);
2047 setOperationAction(ISD::FLOG10, VT, Expand);
2048 setOperationAction(ISD::FEXP, VT, Expand);
2049 setOperationAction(ISD::FEXP2, VT, Expand);
2050 setOperationAction(ISD::FEXP10, VT, Expand);
2051 }
2052
2053 // But we do support custom-lowering for FCOPYSIGN.
2054 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2055 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2056 VT == MVT::v8f16) &&
2057 Subtarget->hasFullFP16()))
2059
2072
2076 for (MVT InnerVT : MVT::all_valuetypes())
2077 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2078
2079 // CNT supports only B element sizes, then use UADDLP to widen.
2080 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2082
2088
2089 for (unsigned Opcode :
2092 setOperationAction(Opcode, VT, Custom);
2093
2094 if (!VT.isFloatingPoint())
2096
2097 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2098 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2099 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2100 setOperationAction(Opcode, VT, Legal);
2101
2102 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2103 // NEON types.
2104 if (VT.isFloatingPoint() &&
2105 VT.getVectorElementType() != MVT::bf16 &&
2106 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2107 for (unsigned Opcode :
2108 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2109 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2113 setOperationAction(Opcode, VT, Legal);
2114
2115 // Strict fp extend and trunc are legal
2116 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2118 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2120
2121 // FIXME: We could potentially make use of the vector comparison instructions
 2122 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2123 // complications:
2124 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2125 // so we would need to expand when the condition code doesn't match the
2126 // kind of comparison.
2127 // * Some kinds of comparison require more than one FCMXY instruction so
2128 // would need to be expanded instead.
2129 // * The lowering of the non-strict versions involves target-specific ISD
2130 // nodes so we would likely need to add strict versions of all of them and
2131 // handle them appropriately.
2134
2135 // When little-endian we can use ordinary d and q register loads/stores for
 2136 // vector types, but when big-endian we need to use the structure load/store
 2137 // instructions, which only allow post-index addressing.
2138 if (Subtarget->isLittleEndian()) {
2139 for (unsigned im = (unsigned)ISD::PRE_INC;
2140 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2143 }
2144 } else {
2147 }
2148
2149 if (Subtarget->hasD128()) {
2152 }
2153
2154 if (VT.isInteger()) {
2155 // Let common code emit inverted variants of compares we do support.
2161 }
2162}
2163
2165 EVT OpVT) const {
2166 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2167 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2168 ResVT.getVectorElementType() != MVT::i1)
2169 return true;
2170
2171 // Only support illegal types if the result is scalable and min elements > 1.
2172 if (ResVT.getVectorMinNumElements() == 1 ||
2173 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2174 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2175 return true;
2176
2177 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2178 // but anything larger should be expanded.
2179 if (OpVT.getFixedSizeInBits() > 64)
2180 return true;
2181
2182 return false;
2183}
2184
2186 const IntrinsicInst *I) const {
2187 assert(I->getIntrinsicID() == Intrinsic::vector_partial_reduce_add &&
2188 "Unexpected intrinsic!");
2189 return true;
2190}
2191
2193 if (!Subtarget->isSVEorStreamingSVEAvailable())
2194 return true;
2195
2196 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2197 // also support fixed-width predicates.
2198 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2199 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2200 VT != MVT::v4i1 && VT != MVT::v2i1;
2201}
2202
2204 unsigned SearchSize) const {
2205 // MATCH is SVE2 and only available in non-streaming mode.
2206 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2207 return true;
2208 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2209 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2210 return SearchSize != 8;
2211 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2212 return SearchSize != 8 && SearchSize != 16;
2213 return true;
2214}
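// Illustrative example (inferred from the checks above): with SVE2 available
// outside streaming mode, a match over nxv16i8 or v16i8 data with a 16-element
// search vector returns false here and can map onto the MATCH instruction,
// whereas a 32-element search, or any 16-bit element type searched for more
// than 8 elements, returns true and is expanded generically.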
2215
2216void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2217 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2218
2219 // By default everything must be expanded.
2220 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2222
2223 if (VT.isFloatingPoint()) {
2233 }
2234
2236 VT == MVT::v1f64 ? Expand : Custom;
2237
2238 // Mark integer truncating stores/extending loads as having custom lowering
2239 if (VT.isInteger()) {
2240 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2241 while (InnerVT != VT) {
2242 setTruncStoreAction(VT, InnerVT, Default);
2243 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2244 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2245 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2246 InnerVT = InnerVT.changeVectorElementType(
2247 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2248 }
2249 }
2250
2251 // Mark floating-point truncating stores/extending loads as having custom
2252 // lowering
2253 if (VT.isFloatingPoint()) {
2254 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2255 while (InnerVT != VT) {
2256 setTruncStoreAction(VT, InnerVT, Custom);
2257 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2258 InnerVT = InnerVT.changeVectorElementType(
2260 }
2261 }
2262
2263 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2264 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2265
2266 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2267 ISD::PARTIAL_REDUCE_UMLA};
2268 unsigned NumElts = VT.getVectorNumElements();
2269 if (VT.getVectorElementType() == MVT::i64) {
2270 setPartialReduceMLAAction(MLAOps, VT,
2271 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2272 setPartialReduceMLAAction(MLAOps, VT,
2273 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2274 setPartialReduceMLAAction(MLAOps, VT,
2275 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2276 } else if (VT.getVectorElementType() == MVT::i32) {
2277 setPartialReduceMLAAction(MLAOps, VT,
2278 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2279 setPartialReduceMLAAction(MLAOps, VT,
2280 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2281 } else if (VT.getVectorElementType() == MVT::i16) {
2282 setPartialReduceMLAAction(MLAOps, VT,
2283 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2284 }
2285 if (Subtarget->hasMatMulInt8()) {
2286 if (VT.getVectorElementType() == MVT::i32)
2287 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2288 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2289 else if (VT.getVectorElementType() == MVT::i64)
2290 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2291 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2292 }
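 // Worked example (derived from the code above, not normative): for VT == v4i32
 // the accumulator/input pairs (v4i32, v16i8) and (v4i32, v8i16) are marked
 // Custom, and with +i8mm the SUMLA form for (v4i32, v16i8) as well, so that
 // they can later be lowered via the equivalent scalable SVE operations.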
2293
2294 // Lower fixed length vector operations to scalable equivalents.
2301 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2311 setOperationAction(ISD::FABS, VT, Default);
2313 setOperationAction(ISD::FCEIL, VT, Default);
2316 setOperationAction(ISD::FFLOOR, VT, Default);
2318 setOperationAction(ISD::FMAXIMUM, VT, Default);
2319 setOperationAction(ISD::FMAXNUM, VT, Default);
2320 setOperationAction(ISD::FMINIMUM, VT, Default);
2321 setOperationAction(ISD::FMINNUM, VT, Default);
2323 setOperationAction(ISD::FNEARBYINT, VT, Default);
2324 setOperationAction(ISD::FNEG, VT, Default);
2325 setOperationAction(ISD::FP_EXTEND, VT, Default);
2329 setOperationAction(ISD::FRINT, VT, Default);
2330 setOperationAction(ISD::LRINT, VT, Default);
2331 setOperationAction(ISD::LLRINT, VT, Default);
2332 setOperationAction(ISD::FROUND, VT, Default);
2333 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2334 setOperationAction(ISD::FSQRT, VT, Default);
2336 setOperationAction(ISD::FTRUNC, VT, Default);
2337 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2339 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2340 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2341 setOperationAction(ISD::MLOAD, VT, Default);
2342 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2343 setOperationAction(ISD::MSTORE, VT, Default);
2361 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2368 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2369 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2370 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2371 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2372 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2373 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2374 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2375 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2376 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2377 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2378 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2379 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2380 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2381 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2387}
2388
2389void AArch64TargetLowering::addDRType(MVT VT) {
2390 addRegisterClass(VT, &AArch64::FPR64RegClass);
2391 if (Subtarget->isNeonAvailable())
2392 addTypeForNEON(VT);
2393}
2394
2395void AArch64TargetLowering::addQRType(MVT VT) {
2396 addRegisterClass(VT, &AArch64::FPR128RegClass);
2397 if (Subtarget->isNeonAvailable())
2398 addTypeForNEON(VT);
2399}
2400
2402 LLVMContext &C, EVT VT) const {
2403 if (!VT.isVector())
2404 return MVT::i32;
2405 if (VT.isScalableVector())
2406 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2408}
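// Illustrative example (inferred from the code above): a setcc on two nxv4i32
// operands yields an nxv4i1 predicate here, while scalar compares always use
// i32 regardless of the operand width.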
2409
2410// isIntImmediate - This method tests to see if the node is a constant
2411// operand. If so, Imm will receive the value.
2412static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2414 Imm = C->getZExtValue();
2415 return true;
2416 }
2417 return false;
2418}
2419
2420bool isVectorizedBinOp(unsigned Opcode) {
2421 switch (Opcode) {
2422 case AArch64ISD::SQDMULH:
2423 return true;
2424 default:
2425 return false;
2426 }
2427}
2428
2429// isOpcWithIntImmediate - This method tests to see if the node is a specific
2430// opcode and that it has an immediate integer right operand.
2431// If so, Imm will receive the value.
2432static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2433 uint64_t &Imm) {
2434 return N->getOpcode() == Opc &&
2435 isIntImmediate(N->getOperand(1).getNode(), Imm);
2436}
2437
2438static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2439 const APInt &Demanded,
2441 unsigned NewOpc) {
2442 uint64_t OldImm = Imm, NewImm, Enc;
2443 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2444
2445 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2446 // bimm64.
2447 if (Imm == 0 || Imm == Mask ||
2449 return false;
2450
2451 unsigned EltSize = Size;
2452 uint64_t DemandedBits = Demanded.getZExtValue();
2453
2454 // Clear bits that are not demanded.
2455 Imm &= DemandedBits;
2456
2457 while (true) {
2458 // The goal here is to set the non-demanded bits in a way that minimizes
 2459 // the number of transitions between 0 and 1. In order to achieve this goal,
2460 // we set the non-demanded bits to the value of the preceding demanded bits.
2461 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2462 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2463 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2464 // The final result is 0b11000011.
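 // (Editorial worked example: tracking the same value with an 8-bit element
 // purely for illustration, DemandedBits = 0b01100101 and Imm = 0b01000001
 // give NonDemandedBits = 0b10011010, InvertedImm = 0b00100100,
 // RotatedImm = 0b00001000, Sum = 0b10100010, Carry = 0, Ones = 0b10000010,
 // and hence NewImm = 0b11000011 as claimed above; in the real code EltSize
 // starts at 32 or 64.)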
2465 uint64_t NonDemandedBits = ~DemandedBits;
2466 uint64_t InvertedImm = ~Imm & DemandedBits;
2467 uint64_t RotatedImm =
2468 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2469 NonDemandedBits;
2470 uint64_t Sum = RotatedImm + NonDemandedBits;
2471 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2472 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2473 NewImm = (Imm | Ones) & Mask;
2474
2475 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2476 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2477 // we halve the element size and continue the search.
2478 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2479 break;
2480
2481 // We cannot shrink the element size any further if it is 2-bits.
2482 if (EltSize == 2)
2483 return false;
2484
2485 EltSize /= 2;
2486 Mask >>= EltSize;
2487 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2488
2489 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2490 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2491 return false;
2492
2493 // Merge the upper and lower halves of Imm and DemandedBits.
2494 Imm |= Hi;
2495 DemandedBits |= DemandedBitsHi;
2496 }
2497
2498 ++NumOptimizedImms;
2499
2500 // Replicate the element across the register width.
2501 while (EltSize < Size) {
2502 NewImm |= NewImm << EltSize;
2503 EltSize *= 2;
2504 }
2505
2506 (void)OldImm;
2507 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2508 "demanded bits should never be altered");
2509 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2510
2511 // Create the new constant immediate node.
2512 EVT VT = Op.getValueType();
2513 SDLoc DL(Op);
2514 SDValue New;
2515
2516 // If the new constant immediate is all-zeros or all-ones, let the target
2517 // independent DAG combine optimize this node.
2518 if (NewImm == 0 || NewImm == OrigMask) {
2519 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2520 TLO.DAG.getConstant(NewImm, DL, VT));
2521 // Otherwise, create a machine node so that target independent DAG combine
2522 // doesn't undo this optimization.
2523 } else {
2525 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2526 New = SDValue(
2527 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2528 }
2529
2530 return TLO.CombineTo(Op, New);
2531}
2532
2534 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2535 TargetLoweringOpt &TLO) const {
2536 // Delay this optimization to as late as possible.
2537 if (!TLO.LegalOps)
2538 return false;
2539
2541 return false;
2542
2543 EVT VT = Op.getValueType();
2544 if (VT.isVector())
2545 return false;
2546
2547 unsigned Size = VT.getSizeInBits();
2548
2549 if (Size != 32 && Size != 64)
2550 return false;
2551
2552 // Exit early if we demand all bits.
2553 if (DemandedBits.popcount() == Size)
2554 return false;
2555
2556 unsigned NewOpc;
2557 switch (Op.getOpcode()) {
2558 default:
2559 return false;
2560 case ISD::AND:
2561 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2562 break;
2563 case ISD::OR:
2564 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2565 break;
2566 case ISD::XOR:
2567 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2568 break;
2569 }
2570 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2571 if (!C)
2572 return false;
2573 uint64_t Imm = C->getZExtValue();
2574 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2575}
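// Worked example (derived from the code above, not normative): for a 32-bit
// (and X, 0x4800) of which only bits 12-15 are demanded, 0x4800 is not a valid
// logical immediate, but the search in optimizeLogicalImm settles on 0x4000
// (which is), so the node is rewritten to an ANDWri machine node while the
// value of the demanded bits is unchanged.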
2576
2577/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2578/// Mask are known to be either zero or one and return them Known.
2580 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2581 const SelectionDAG &DAG, unsigned Depth) const {
2582 switch (Op.getOpcode()) {
2583 default:
2584 break;
2585 case AArch64ISD::DUP: {
2586 SDValue SrcOp = Op.getOperand(0);
2587 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2588 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2589 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2590 "Expected DUP implicit truncation");
2591 Known = Known.trunc(Op.getScalarValueSizeInBits());
2592 }
2593 break;
2594 }
2595 case AArch64ISD::CSEL: {
2596 KnownBits Known2;
2597 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2598 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2599 Known = Known.intersectWith(Known2);
2600 break;
2601 }
2602 case AArch64ISD::CSNEG:
2603 case AArch64ISD::CSINC:
2604 case AArch64ISD::CSINV: {
2605 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2606 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2607
2608 // The result is either:
2609 // CSINC: KnownOp0 or KnownOp1 + 1
2610 // CSINV: KnownOp0 or ~KnownOp1
2611 // CSNEG: KnownOp0 or KnownOp1 * -1
2612 if (Op.getOpcode() == AArch64ISD::CSINC)
2613 KnownOp1 = KnownBits::add(
2614 KnownOp1,
2615 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2616 else if (Op.getOpcode() == AArch64ISD::CSINV)
2617 std::swap(KnownOp1.Zero, KnownOp1.One);
2618 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2619 KnownOp1 =
2621 Op.getScalarValueSizeInBits())));
2622
2623 Known = KnownOp0.intersectWith(KnownOp1);
2624 break;
2625 }
2626 case AArch64ISD::BICi: {
2627 // Compute the bit cleared value.
2628 APInt Mask =
2629 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2630 .trunc(Known.getBitWidth());
2631 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2632 Known &= KnownBits::makeConstant(Mask);
2633 break;
2634 }
2635 case AArch64ISD::VLSHR: {
2636 KnownBits Known2;
2637 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2638 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2639 Known = KnownBits::lshr(Known, Known2);
2640 break;
2641 }
2642 case AArch64ISD::VASHR: {
2643 KnownBits Known2;
2644 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2645 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2646 Known = KnownBits::ashr(Known, Known2);
2647 break;
2648 }
2649 case AArch64ISD::VSHL: {
2650 KnownBits Known2;
2651 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2652 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2653 Known = KnownBits::shl(Known, Known2);
2654 break;
2655 }
2656 case AArch64ISD::MOVI: {
2658 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2659 break;
2660 }
2661 case AArch64ISD::MOVIshift: {
2663 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2664 << Op->getConstantOperandVal(1)));
2665 break;
2666 }
2667 case AArch64ISD::MOVImsl: {
2668 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2670 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2671 break;
2672 }
2673 case AArch64ISD::MOVIedit: {
2675 Known.getBitWidth(),
2676 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2677 break;
2678 }
2679 case AArch64ISD::MVNIshift: {
2681 APInt(Known.getBitWidth(),
2682 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2683 /*isSigned*/ false, /*implicitTrunc*/ true));
2684 break;
2685 }
2686 case AArch64ISD::MVNImsl: {
2687 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2689 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2690 /*isSigned*/ false, /*implicitTrunc*/ true));
2691 break;
2692 }
2693 case AArch64ISD::LOADgot:
2694 case AArch64ISD::ADDlow: {
2695 if (!Subtarget->isTargetILP32())
2696 break;
2697 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2698 Known.Zero = APInt::getHighBitsSet(64, 32);
2699 break;
2700 }
2701 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2702 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2703 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2704 break;
2705 }
2707 Intrinsic::ID IntID =
2708 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2709 switch (IntID) {
2710 default: return;
2711 case Intrinsic::aarch64_ldaxr:
2712 case Intrinsic::aarch64_ldxr: {
2713 unsigned BitWidth = Known.getBitWidth();
2714 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2715 unsigned MemBits = VT.getScalarSizeInBits();
2716 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2717 return;
2718 }
2719 }
2720 break;
2721 }
2723 case ISD::INTRINSIC_VOID: {
2724 unsigned IntNo = Op.getConstantOperandVal(0);
2725 switch (IntNo) {
2726 default:
2727 break;
2728 case Intrinsic::aarch64_neon_uaddlv: {
2729 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2730 unsigned BitWidth = Known.getBitWidth();
2731 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2732 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2733 assert(BitWidth >= Bound && "Unexpected width!");
2735 Known.Zero |= Mask;
2736 }
2737 break;
2738 }
2739 case Intrinsic::aarch64_neon_umaxv:
2740 case Intrinsic::aarch64_neon_uminv: {
2741 // Figure out the datatype of the vector operand. The UMINV instruction
2742 // will zero extend the result, so we can mark as known zero all the
 2743 // bits larger than the element datatype. 32-bit or larger doesn't need
2744 // this as those are legal types and will be handled by isel directly.
2745 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2746 unsigned BitWidth = Known.getBitWidth();
2747 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2748 assert(BitWidth >= 8 && "Unexpected width!");
2750 Known.Zero |= Mask;
2751 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2752 assert(BitWidth >= 16 && "Unexpected width!");
2754 Known.Zero |= Mask;
2755 }
2756 break;
 2757 }
2758 }
2759 }
2760 }
2761}
2762
2764 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2765 unsigned Depth) const {
2766 EVT VT = Op.getValueType();
2767 unsigned VTBits = VT.getScalarSizeInBits();
2768 unsigned Opcode = Op.getOpcode();
2769 switch (Opcode) {
2770 case AArch64ISD::FCMEQ:
2771 case AArch64ISD::FCMGE:
2772 case AArch64ISD::FCMGT:
2773 // Compares return either 0 or all-ones
2774 return VTBits;
2775 case AArch64ISD::VASHR: {
2776 unsigned Tmp =
2777 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2778 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2779 }
2780 }
2781
2782 return 1;
2783}
2784
2786 EVT) const {
2787 return MVT::i64;
2788}
2789
2791 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793
2794 // Allow SVE loads/stores where the alignment >= the size of the element type,
2795 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2796 // for stores that come from IR, only require element-size alignment (even if
2797 // unaligned accesses are disabled). Without this, these will be forced to
2798 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2799 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2800 if (VT.isScalableVector()) {
2801 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2802 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2803 return true;
2804 }
2805
2806 if (Subtarget->requiresStrictAlign())
2807 return false;
2808
2809 if (Fast) {
2810 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2811 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2812 // See comments in performSTORECombine() for more details about
2813 // these conditions.
2814
2815 // Code that uses clang vector extensions can mark that it
2816 // wants unaligned accesses to be treated as fast by
2817 // underspecifying alignment to be 1 or 2.
2818 Alignment <= 2 ||
2819
2820 // Disregard v2i64. Memcpy lowering produces those and splitting
2821 // them regresses performance on micro-benchmarks and olden/bh.
2822 VT == MVT::v2i64;
2823 }
2824 return true;
2825}
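// Illustrative example (inferred from the checks above): under +strict-align
// an nxv4i32 access with only 4-byte alignment is still accepted by the
// scalable-vector path, since predicated SVE loads/stores need only element
// alignment, while a v4i32 access with 1-byte alignment is rejected.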
2826
2827// Same as above but handling LLTs instead.
2829 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2830 unsigned *Fast) const {
2831 if (Subtarget->requiresStrictAlign())
2832 return false;
2833
2834 if (Fast) {
2835 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2836 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2837 Ty.getSizeInBytes() != 16 ||
2838 // See comments in performSTORECombine() for more details about
2839 // these conditions.
2840
2841 // Code that uses clang vector extensions can mark that it
2842 // wants unaligned accesses to be treated as fast by
2843 // underspecifying alignment to be 1 or 2.
2844 Alignment <= 2 ||
2845
2846 // Disregard v2i64. Memcpy lowering produces those and splitting
2847 // them regresses performance on micro-benchmarks and olden/bh.
2848 Ty == LLT::fixed_vector(2, 64);
2849 }
2850 return true;
2851}
2852
2853FastISel *
2855 const TargetLibraryInfo *libInfo) const {
2856 return AArch64::createFastISel(funcInfo, libInfo);
2857}
2858
2861 MachineBasicBlock *MBB) const {
2862 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2863 // phi node:
2864
2865 // OrigBB:
2866 // [... previous instrs leading to comparison ...]
2867 // b.ne TrueBB
2868 // b EndBB
2869 // TrueBB:
2870 // ; Fallthrough
2871 // EndBB:
2872 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2873
2874 MachineFunction *MF = MBB->getParent();
2875 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2876 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2877 DebugLoc DL = MI.getDebugLoc();
2878 MachineFunction::iterator It = ++MBB->getIterator();
2879
2880 Register DestReg = MI.getOperand(0).getReg();
2881 Register IfTrueReg = MI.getOperand(1).getReg();
2882 Register IfFalseReg = MI.getOperand(2).getReg();
2883 unsigned CondCode = MI.getOperand(3).getImm();
2884 bool NZCVKilled = MI.getOperand(4).isKill();
2885
2886 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2887 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2888 MF->insert(It, TrueBB);
2889 MF->insert(It, EndBB);
2890
2891 // Transfer rest of current basic-block to EndBB
2892 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2893 MBB->end());
2895
2896 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2897 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2898 MBB->addSuccessor(TrueBB);
2899 MBB->addSuccessor(EndBB);
2900
2901 // TrueBB falls through to the end.
2902 TrueBB->addSuccessor(EndBB);
2903
2904 if (!NZCVKilled) {
2905 TrueBB->addLiveIn(AArch64::NZCV);
2906 EndBB->addLiveIn(AArch64::NZCV);
2907 }
2908
2909 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2910 .addReg(IfTrueReg)
2911 .addMBB(TrueBB)
2912 .addReg(IfFalseReg)
2913 .addMBB(MBB);
2914
2915 MI.eraseFromParent();
2916 return EndBB;
2917}
2918
2926
2929 MachineBasicBlock *MBB) const {
2930 MachineFunction &MF = *MBB->getParent();
2931 MachineBasicBlock::iterator MBBI = MI.getIterator();
2932 const AArch64InstrInfo &TII =
2933 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2934 Register TargetReg = MI.getOperand(0).getReg();
2936 TII.probedStackAlloc(MBBI, TargetReg, false);
2937
2938 MI.eraseFromParent();
2939 return NextInst->getParent();
2940}
2941
2944 MachineBasicBlock *MBB) const {
2945 MachineFunction *MF = MBB->getParent();
2947
2948 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2949 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2950
2951 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2952 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2953 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2954 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2955
2956 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2957 DebugLoc DL = MI.getDebugLoc();
2958
2959 // RDVL requires GPR64, ADDSVL requires GPR64sp
 2960 // We need to insert COPY instructions; these will later be removed by the
2961 // RegisterCoalescer
2962 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2963 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2964 .addReg(RegVL_GPR);
2965
2966 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2967 .addReg(RegVL_GPRsp)
2968 .addImm(-1);
2969 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2970 .addReg(RegSVL_GPRsp);
2971
2972 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2973 MachineFunction::iterator It = ++MBB->getIterator();
2974 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2975 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2976 MF->insert(It, TrapBB);
2977 MF->insert(It, PassBB);
2978
2979 // Continue if vector lengths match
2980 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2981 .addReg(RegSVL_GPR)
2982 .addMBB(PassBB);
2983
2984 // Transfer rest of current BB to PassBB
2985 PassBB->splice(PassBB->begin(), MBB,
2986 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2988
2989 // Trap if vector lengths mismatch
2990 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2991
2992 MBB->addSuccessor(TrapBB);
2993 MBB->addSuccessor(PassBB);
2994
2995 MI.eraseFromParent();
2996 return PassBB;
2997}
2998
3000AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3002 MachineBasicBlock *BB) const {
3003 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3004 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3005
3006 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3007 MIB.add(MI.getOperand(1)); // slice index register
3008 MIB.add(MI.getOperand(2)); // slice index offset
3009 MIB.add(MI.getOperand(3)); // pg
3010 MIB.add(MI.getOperand(4)); // base
3011 MIB.add(MI.getOperand(5)); // offset
3012
3013 MI.eraseFromParent(); // The pseudo is gone now.
3014 return BB;
3015}
3016
3019 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3021 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3022
3023 MIB.addReg(AArch64::ZA, RegState::Define);
3024 MIB.add(MI.getOperand(0)); // Vector select register
3025 MIB.add(MI.getOperand(1)); // Vector select offset
3026 MIB.add(MI.getOperand(2)); // Base
3027 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3028
3029 MI.eraseFromParent(); // The pseudo is gone now.
3030 return BB;
3031}
3032
3035 unsigned Opcode,
3036 bool Op0IsDef) const {
3037 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3039
3040 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3041 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3042 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3043 MIB.add(MI.getOperand(I));
3044
3045 MI.eraseFromParent(); // The pseudo is gone now.
3046 return BB;
3047}
3048
3050AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3052 MachineBasicBlock *BB) const {
3053 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3054 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3055 unsigned StartIdx = 0;
3056
3057 bool HasTile = BaseReg != AArch64::ZA;
3058 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3059 if (HasZPROut) {
3060 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3061 ++StartIdx;
3062 }
3063 if (HasTile) {
3064 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3065 RegState::Define); // Output ZA Tile
3066 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3067 StartIdx++;
3068 } else {
3069 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3070 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3071 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3072 ++StartIdx;
3073 }
3074 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3075 }
3076 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3077 MIB.add(MI.getOperand(I));
3078
3079 MI.eraseFromParent(); // The pseudo is gone now.
3080 return BB;
3081}
3082
3085 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3087 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3088 MIB.add(MI.getOperand(0)); // Mask
3089
3090 unsigned Mask = MI.getOperand(0).getImm();
3091 for (unsigned I = 0; I < 8; I++) {
3092 if (Mask & (1 << I))
3093 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3094 }
3095
3096 MI.eraseFromParent(); // The pseudo is gone now.
3097 return BB;
3098}
3099
3102 MachineBasicBlock *BB) const {
3103 MachineFunction *MF = BB->getParent();
3104 MachineFrameInfo &MFI = MF->getFrameInfo();
3106 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3107 if (TPIDR2.Uses > 0) {
3108 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3109 // generally don't support big-endian SVE/SME.
3110 if (!Subtarget->isLittleEndian())
3112 "TPIDR2 block initialization is not supported on big-endian targets");
3113
3114 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3115 // Store buffer pointer and num_za_save_slices.
3116 // Bytes 10-15 are implicitly zeroed.
3117 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3118 .addReg(MI.getOperand(0).getReg())
3119 .addReg(MI.getOperand(1).getReg())
3120 .addFrameIndex(TPIDR2.FrameIndex)
3121 .addImm(0);
3122 } else
3123 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3124
3125 BB->remove_instr(&MI);
3126 return BB;
3127}
3128
3131 MachineBasicBlock *BB) const {
3132 MachineFunction *MF = BB->getParent();
3133 MachineFrameInfo &MFI = MF->getFrameInfo();
3135 // TODO This function grows the stack with a subtraction, which doesn't work
3136 // on Windows. Some refactoring to share the functionality in
3137 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3138 // supports SME
3140 "Lazy ZA save is not yet supported on Windows");
3141
3142 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3143
3144 if (TPIDR2.Uses > 0) {
3145 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3147
3148 // The SUBXrs below won't always be emitted in a form that accepts SP
3149 // directly
3150 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3151 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3152 .addReg(AArch64::SP);
3153
3154 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3155 auto Size = MI.getOperand(1).getReg();
3156 auto Dest = MI.getOperand(0).getReg();
3157 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3158 .addReg(Size)
3159 .addReg(Size)
3160 .addReg(SP);
3161 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3162 AArch64::SP)
3163 .addReg(Dest);
3164
3165 // We have just allocated a variable sized object, tell this to PEI.
3166 MFI.CreateVariableSizedObject(Align(16), nullptr);
3167 }
3168
3169 BB->remove_instr(&MI);
3170 return BB;
3171}
3172
3173// TODO: Find a way to merge this with EmitAllocateZABuffer.
3176 MachineBasicBlock *BB) const {
3177 MachineFunction *MF = BB->getParent();
3178 MachineFrameInfo &MFI = MF->getFrameInfo();
3181 "Lazy ZA save is not yet supported on Windows");
3182
3183 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3184 if (FuncInfo->isSMESaveBufferUsed()) {
3185 // Allocate a buffer object of the size given by MI.getOperand(1).
3186 auto Size = MI.getOperand(1).getReg();
3187 auto Dest = MI.getOperand(0).getReg();
3188 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3189 .addReg(AArch64::SP)
3190 .addReg(Size)
3192 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3193 .addReg(AArch64::SP);
3194
3195 // We have just allocated a variable sized object, tell this to PEI.
3196 MFI.CreateVariableSizedObject(Align(16), nullptr);
3197 } else
3198 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3199 MI.getOperand(0).getReg());
3200
3201 BB->remove_instr(&MI);
3202 return BB;
3203}
3204
3207 MachineBasicBlock *BB) const {
3208 // If the buffer is used, emit a call to __arm_sme_state_size()
3209 MachineFunction *MF = BB->getParent();
3211 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3212 if (FuncInfo->isSMESaveBufferUsed()) {
3213 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3214 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3215 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3217 .addReg(AArch64::X0, RegState::ImplicitDefine)
3218 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3219 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3220 MI.getOperand(0).getReg())
3221 .addReg(AArch64::X0);
3222 } else
3223 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3224 MI.getOperand(0).getReg())
3225 .addReg(AArch64::XZR);
3226 BB->remove_instr(&MI);
3227 return BB;
3228}
3229
3232 MachineBasicBlock *BB) const {
3233 MachineFunction *MF = BB->getParent();
3234 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3235 const DebugLoc &DL = MI.getDebugLoc();
3236 Register ResultReg = MI.getOperand(0).getReg();
3237 if (MF->getRegInfo().use_empty(ResultReg)) {
3238 // Nothing to do. Pseudo erased below.
3239 } else if (Subtarget->hasSME()) {
3240 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3241 .addImm(AArch64SysReg::SVCR)
3242 .addReg(AArch64::VG, RegState::Implicit);
3243 } else {
3244 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3245 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3246 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3248 .addReg(AArch64::X0, RegState::ImplicitDefine)
3249 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3250 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3251 .addReg(AArch64::X0);
3252 }
3253 MI.eraseFromParent();
3254 return BB;
3255}
3256
3257// Helper function to find the instruction that defined a virtual register.
3258// If unable to find such instruction, returns nullptr.
3260 Register Reg) {
3261 while (Reg.isVirtual()) {
3262 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3263 assert(DefMI && "Virtual register definition not found");
3264 unsigned Opcode = DefMI->getOpcode();
3265
3266 if (Opcode == AArch64::COPY) {
3267 Reg = DefMI->getOperand(1).getReg();
3268 // Vreg is defined by copying from physreg.
3269 if (Reg.isPhysical())
3270 return DefMI;
3271 continue;
3272 }
3273 if (Opcode == AArch64::SUBREG_TO_REG) {
3274 Reg = DefMI->getOperand(2).getReg();
3275 continue;
3276 }
3277
3278 return DefMI;
3279 }
3280 return nullptr;
3281}
3282
3285 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3286 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3287 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3288 const DebugLoc &DL = MI.getDebugLoc();
3289
3290 Register AddrDisc = AddrDiscOp.getReg();
3291 int64_t IntDisc = IntDiscOp.getImm();
3292 assert(IntDisc == 0 && "Blend components are already expanded");
3293
3294 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3295 if (DiscMI) {
3296 switch (DiscMI->getOpcode()) {
3297 case AArch64::MOVKXi:
3298 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3299 // #imm should be an immediate and not a global symbol, for example.
3300 if (DiscMI->getOperand(2).isImm() &&
3301 DiscMI->getOperand(3).getImm() == 48) {
3302 AddrDisc = DiscMI->getOperand(1).getReg();
3303 IntDisc = DiscMI->getOperand(2).getImm();
3304 }
3305 break;
3306 case AArch64::MOVi32imm:
3307 case AArch64::MOVi64imm:
3308 // Small immediate integer constant passed via VReg.
3309 if (DiscMI->getOperand(1).isImm() &&
3310 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3311 AddrDisc = AArch64::NoRegister;
3312 IntDisc = DiscMI->getOperand(1).getImm();
3313 }
3314 break;
3315 }
3316 }
3317
3318 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3319 // in the requested register class.
3320 if (AddrDisc == AArch64::XZR)
3321 AddrDisc = AArch64::NoRegister;
3322
3323 // Make sure AddrDisc operand respects the register class imposed by MI.
3324 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3325 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3326 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3327 AddrDisc = TmpReg;
3328 }
3329
3330 AddrDiscOp.setReg(AddrDisc);
3331 IntDiscOp.setImm(IntDisc);
3332}
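// Illustrative example (inferred from the code above; the constant is made
// up): a discriminator materialised as "MOVK %addr, #1234, lsl #48" is split
// back into the %addr register plus the integer 1234, and a bare MOVi64imm of
// a 16-bit constant folds entirely into the integer operand, leaving no
// address discriminator.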
3333
3335 MachineInstr &MI, MachineBasicBlock *BB) const {
3336
3337 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3338 if (SMEOrigInstr != -1) {
3339 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3340 uint64_t SMEMatrixType =
3341 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3342 switch (SMEMatrixType) {
3344 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3346 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3348 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3350 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3352 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3354 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3355 }
3356 }
3357
3358 switch (MI.getOpcode()) {
3359 default:
3360#ifndef NDEBUG
3361 MI.dump();
3362#endif
3363 llvm_unreachable("Unexpected instruction for custom inserter!");
3364 case AArch64::InitTPIDR2Obj:
3365 return EmitInitTPIDR2Object(MI, BB);
3366 case AArch64::AllocateZABuffer:
3367 return EmitAllocateZABuffer(MI, BB);
3368 case AArch64::AllocateSMESaveBuffer:
3369 return EmitAllocateSMESaveBuffer(MI, BB);
3370 case AArch64::GetSMESaveSize:
3371 return EmitGetSMESaveSize(MI, BB);
3372 case AArch64::EntryPStateSM:
3373 return EmitEntryPStateSM(MI, BB);
3374 case AArch64::F128CSEL:
3375 return EmitF128CSEL(MI, BB);
3376 case TargetOpcode::STATEPOINT:
3377 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
 3378 // while the BL call instruction (to which the statepoint is lowered in the end)
 3379 // has an implicit def. This def is early-clobber as it will be set at
3380 // the moment of the call and earlier than any use is read.
3381 // Add this implicit dead def here as a workaround.
3382 MI.addOperand(*MI.getMF(),
3384 AArch64::LR, /*isDef*/ true,
3385 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3386 /*isUndef*/ false, /*isEarlyClobber*/ true));
3387 [[fallthrough]];
3388 case TargetOpcode::STACKMAP:
3389 case TargetOpcode::PATCHPOINT:
3390 return emitPatchPoint(MI, BB);
3391
3392 case TargetOpcode::PATCHABLE_EVENT_CALL:
3393 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3394 return BB;
3395
3396 case AArch64::CATCHRET:
3397 return EmitLoweredCatchRet(MI, BB);
3398
3399 case AArch64::PROBED_STACKALLOC_DYN:
3400 return EmitDynamicProbedAlloc(MI, BB);
3401
3402 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3403 return EmitCheckMatchingVL(MI, BB);
3404
3405 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3406 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3407 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3408 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3409 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3410 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3411 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3412 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3413 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3414 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3415 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3416 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3417 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3418 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3419 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3420 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3421 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3422 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3423 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3424 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3425 case AArch64::LDR_ZA_PSEUDO:
3426 return EmitFill(MI, BB);
3427 case AArch64::LDR_TX_PSEUDO:
3428 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3429 case AArch64::STR_TX_PSEUDO:
3430 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3431 case AArch64::ZERO_M_PSEUDO:
3432 return EmitZero(MI, BB);
3433 case AArch64::ZERO_T_PSEUDO:
3434 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3435 case AArch64::MOVT_TIZ_PSEUDO:
3436 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3437
3438 case AArch64::PAC:
3439 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3440 &AArch64::GPR64noipRegClass);
3441 return BB;
3442 }
3443}
3444
3445//===----------------------------------------------------------------------===//
3446// AArch64 Lowering private implementation.
3447//===----------------------------------------------------------------------===//
3448
3449//===----------------------------------------------------------------------===//
3450// Lowering Code
3451//===----------------------------------------------------------------------===//
3452
3453// Forward declarations of SVE fixed length lowering helpers
3458 SelectionDAG &DAG);
3461 EVT VT);
3462
3463/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3464static bool isZerosVector(const SDNode *N) {
3465 // Look through a bit convert.
3466 while (N->getOpcode() == ISD::BITCAST)
3467 N = N->getOperand(0).getNode();
3468
3470 return true;
3471
3472 if (N->getOpcode() != AArch64ISD::DUP)
3473 return false;
3474
3475 auto Opnd0 = N->getOperand(0);
3476 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3477}
3478
3479/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3480/// CC
3482 SDValue RHS = {}) {
3483 switch (CC) {
3484 default:
3485 llvm_unreachable("Unknown condition code!");
3486 case ISD::SETNE:
3487 return AArch64CC::NE;
3488 case ISD::SETEQ:
3489 return AArch64CC::EQ;
3490 case ISD::SETGT:
3491 return AArch64CC::GT;
3492 case ISD::SETGE:
3494 case ISD::SETLT:
3496 case ISD::SETLE:
3497 return AArch64CC::LE;
3498 case ISD::SETUGT:
3499 return AArch64CC::HI;
3500 case ISD::SETUGE:
3501 return AArch64CC::HS;
3502 case ISD::SETULT:
3503 return AArch64CC::LO;
3504 case ISD::SETULE:
3505 return AArch64CC::LS;
3506 }
3507}
3508
3509/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3511 AArch64CC::CondCode &CondCode,
3512 AArch64CC::CondCode &CondCode2) {
3513 CondCode2 = AArch64CC::AL;
3514 switch (CC) {
3515 default:
3516 llvm_unreachable("Unknown FP condition!");
3517 case ISD::SETEQ:
3518 case ISD::SETOEQ:
3519 CondCode = AArch64CC::EQ;
3520 break;
3521 case ISD::SETGT:
3522 case ISD::SETOGT:
3523 CondCode = AArch64CC::GT;
3524 break;
3525 case ISD::SETGE:
3526 case ISD::SETOGE:
3527 CondCode = AArch64CC::GE;
3528 break;
3529 case ISD::SETOLT:
3530 CondCode = AArch64CC::MI;
3531 break;
3532 case ISD::SETOLE:
3533 CondCode = AArch64CC::LS;
3534 break;
3535 case ISD::SETONE:
3536 CondCode = AArch64CC::MI;
3537 CondCode2 = AArch64CC::GT;
3538 break;
3539 case ISD::SETO:
3540 CondCode = AArch64CC::VC;
3541 break;
3542 case ISD::SETUO:
3543 CondCode = AArch64CC::VS;
3544 break;
3545 case ISD::SETUEQ:
3546 CondCode = AArch64CC::EQ;
3547 CondCode2 = AArch64CC::VS;
3548 break;
3549 case ISD::SETUGT:
3550 CondCode = AArch64CC::HI;
3551 break;
3552 case ISD::SETUGE:
3553 CondCode = AArch64CC::PL;
3554 break;
3555 case ISD::SETLT:
3556 case ISD::SETULT:
3557 CondCode = AArch64CC::LT;
3558 break;
3559 case ISD::SETLE:
3560 case ISD::SETULE:
3561 CondCode = AArch64CC::LE;
3562 break;
3563 case ISD::SETNE:
3564 case ISD::SETUNE:
3565 CondCode = AArch64CC::NE;
3566 break;
3567 }
3568}
3569
3570/// Convert a DAG fp condition code to an AArch64 CC.
3571/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3572/// should be AND'ed instead of OR'ed.
3574 AArch64CC::CondCode &CondCode,
3575 AArch64CC::CondCode &CondCode2) {
3576 CondCode2 = AArch64CC::AL;
3577 switch (CC) {
3578 default:
3579 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3580 assert(CondCode2 == AArch64CC::AL);
3581 break;
3582 case ISD::SETONE:
3583 // (a one b)
3584 // == ((a olt b) || (a ogt b))
3585 // == ((a ord b) && (a une b))
3586 CondCode = AArch64CC::VC;
3587 CondCode2 = AArch64CC::NE;
3588 break;
3589 case ISD::SETUEQ:
3590 // (a ueq b)
3591 // == ((a uno b) || (a oeq b))
3592 // == ((a ule b) && (a uge b))
3593 CondCode = AArch64CC::PL;
3594 CondCode2 = AArch64CC::LE;
3595 break;
3596 }
3597}
3598
3599/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3600/// CC usable with the vector instructions. Fewer operations are available
3601/// without a real NZCV register, so we have to use less efficient combinations
3602/// to get the same effect.
3604 AArch64CC::CondCode &CondCode,
3605 AArch64CC::CondCode &CondCode2,
3606 bool &Invert) {
3607 Invert = false;
3608 switch (CC) {
3609 default:
3610 // Mostly the scalar mappings work fine.
3611 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3612 break;
3613 case ISD::SETUO:
3614 Invert = true;
3615 [[fallthrough]];
3616 case ISD::SETO:
3617 CondCode = AArch64CC::MI;
3618 CondCode2 = AArch64CC::GE;
3619 break;
3620 case ISD::SETUEQ:
3621 case ISD::SETULT:
3622 case ISD::SETULE:
3623 case ISD::SETUGT:
3624 case ISD::SETUGE:
3625 // All of the compare-mask comparisons are ordered, but we can switch
3626 // between the two by a double inversion. E.g. ULE == !OGT.
3627 Invert = true;
3628 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3629 CondCode, CondCode2);
3630 break;
3631 }
3632}
3633
3634/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3636 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3637 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3638}
3639
3641 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3642 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3643 LLVM_DEBUG(dbgs() << "Is imm " << C
3644 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3645 return IsLegal;
3646}
3647
3649 // Works for negative immediates too, as it can be written as an ADDS
3650 // instruction with a negated immediate.
3651 return isLegalArithImmed(C.abs().getZExtValue());
3652}
3653
3655 uint64_t Imm = C.getZExtValue();
3657 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3658 return Insn.size();
3659}
3660
3662 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3663 if (Op->getFlags().hasNoSignedWrap())
3664 return true;
3665
3666 // We can still figure out if the second operand is safe to use
3667 // in a CMN instruction by checking if it is known to be not the minimum
3668 // signed value. If it is not, then we can safely use CMN.
3669 // Note: We can eventually remove this check and simply rely on
3670 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3671 // consistently sets them appropriately when making said nodes.
3672
3673 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3674 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3675}
3676
3677// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3678// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3679// can be set differently by this operation. It comes down to whether
3680// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3681// everything is fine. If not then the optimization is wrong. Thus general
3682// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3683//
3684// So, finally, the only LLVM-native comparisons that don't mention C or V
3685// are the ones that aren't unsigned comparisons. They're the only ones we can
3686// safely use CMN for in the absence of information about op2.
3688 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3689 (isIntEqualitySetCC(CC) ||
3690 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3691 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3692}
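// Illustrative example (restating the checks above): (setcc eq X, (sub 0, Y))
// can always become CMN X, Y since only the Z flag is read; an unsigned
// compare additionally needs Y known non-zero, and a signed compare needs the
// negation to be known not to wrap (Y != INT_MIN), as isSafeSignedCMN checks.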
3693
3695 SelectionDAG &DAG, SDValue Chain,
3696 bool IsSignaling) {
3697 EVT VT = LHS.getValueType();
3698 assert(VT != MVT::f128);
3699
3700 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3701
3702 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3703 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3704 {Chain, LHS});
3705 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3706 {LHS.getValue(1), RHS});
3707 Chain = RHS.getValue(1);
3708 }
3709 unsigned Opcode =
3710 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3711 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3712}
3713
3715 const SDLoc &DL, SelectionDAG &DAG) {
3716 EVT VT = LHS.getValueType();
3717 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3718
3719 if (VT.isFloatingPoint()) {
3720 assert(VT != MVT::f128);
3721 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3722 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3723 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3724 }
3725 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3726 }
3727
3728 // The CMP instruction is just an alias for SUBS, and representing it as
3729 // SUBS means that it's possible to get CSE with subtract operations.
3730 // A later phase can perform the optimization of setting the destination
3731 // register to WZR/XZR if it ends up being unused.
3732 unsigned Opcode = AArch64ISD::SUBS;
3733
3734 if (isCMN(RHS, CC, DAG)) {
3735 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3736 Opcode = AArch64ISD::ADDS;
3737 RHS = RHS.getOperand(1);
3738 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3739 isIntEqualitySetCC(CC)) {
3740 // As we are looking for EQ/NE compares, the operands can be commuted; can
3741 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3742 Opcode = AArch64ISD::ADDS;
3743 LHS = LHS.getOperand(1);
3744 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3745 if (LHS.getOpcode() == ISD::AND) {
3746 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3747 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3748 // of the signed comparisons.
3749 const SDValue ANDSNode =
3750 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3751 LHS.getOperand(0), LHS.getOperand(1));
3752 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3753 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3754 return ANDSNode.getValue(1);
3755 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3756 // Use result of ANDS
3757 return LHS.getValue(1);
3758 }
3759 }
3760
3761 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3762 .getValue(1);
3763}
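// For example (a sketch, not from the original source), comparing a masked
// value against zero with an equality predicate:
//   (setcc eq (and x, 0xff), 0)
// is emitted here as
//   tst w0, #0xff     // alias of ands wzr, w0, #0xff
// so the Z flag can be tested directly instead of doing a subs against #0.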
3764
3765/// \defgroup AArch64CCMP CMP;CCMP matching
3766///
3767/// These functions deal with the formation of CMP;CCMP;... sequences.
3768/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3769/// a comparison. They set the NZCV flags to a predefined value if their
3770/// predicate is false. This allows to express arbitrary conjunctions, for
3771/// predicate is false. This allows us to express arbitrary conjunctions, for
3772/// expressed as:
3773/// cmp A
3774/// ccmp B, inv(CB), CA
3775/// check for CB flags
3776///
3777/// This naturally lets us implement chains of AND operations with SETCC
3778/// operands. And we can even implement some other situations by transforming
3779/// them:
3780/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3781/// negating the flags used in the CCMP/FCCMP operations.
3782/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3783/// by negating the flags we test for afterwards. i.e.
3784/// NEG (CMP CCMP CCCMP ...) can be implemented.
3785/// - Note that we can only ever negate all previously processed results.
3786/// What we can not implement by flipping the flags to test is a negation
3787/// of two sub-trees (because the negation affects all sub-trees emitted so
3788/// far, so the 2nd sub-tree we emit would also affect the first).
3789/// With those tools we can implement some OR operations:
3790/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3791/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3792/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3793/// elimination rules from earlier to implement the whole thing as a
3794/// CCMP/FCCMP chain.
3795///
3796/// As complete example:
3797/// or (or (setCA (cmp A)) (setCB (cmp B)))
3798///    (and (setCC (cmp C)) (setCD (cmp D)))
3799/// can be reassociated to:
3800/// or (and (setCC (cmp C)) (setCD (cmp D)))
3801///    (or (setCA (cmp A)) (setCB (cmp B)))
3802/// can be transformed to:
3803/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3804///          (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3805/// which can be implemented as:
3806/// cmp C
3807/// ccmp D, inv(CD), CC
3808/// ccmp A, CA, inv(CD)
3809/// ccmp B, CB, inv(CA)
3810/// check for CB flags
3811///
3812/// A counterexample is "or (and A B) (and C D)" which translates to
3813/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3814/// can only implement 1 of the inner (not) operations, but not both!
3815/// @{
3816
3817/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3818static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3819 ISD::CondCode CC, SDValue CCOp,
3820 AArch64CC::CondCode Predicate,
3821 AArch64CC::CondCode OutCC,
3822 const SDLoc &DL, SelectionDAG &DAG) {
3823 unsigned Opcode = 0;
3824 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3825
3826 if (LHS.getValueType().isFloatingPoint()) {
3827 assert(LHS.getValueType() != MVT::f128);
3828 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3829 LHS.getValueType() == MVT::bf16) {
3830 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3831 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3832 }
3833 Opcode = AArch64ISD::FCCMP;
3834 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3835 APInt Imm = Const->getAPIntValue();
3836 if (Imm.isNegative() && Imm.sgt(-32)) {
3837 Opcode = AArch64ISD::CCMN;
3838 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3839 }
3840 } else if (isCMN(RHS, CC, DAG)) {
3841 Opcode = AArch64ISD::CCMN;
3842 RHS = RHS.getOperand(1);
3843 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3844 isIntEqualitySetCC(CC)) {
3845 // As we are looking for EQ/NE compares, the operands can be commuted; can
3846 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3847 Opcode = AArch64ISD::CCMN;
3848 LHS = LHS.getOperand(1);
3849 }
3850 if (Opcode == 0)
3851 Opcode = AArch64ISD::CCMP;
3852
3853 SDValue Condition = getCondCode(DAG, Predicate);
3854 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3855 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3856 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3857 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3858}
3859
3860/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3861/// expressed as a conjunction. See \ref AArch64CCMP.
3862/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3863/// changing the conditions on the SETCC tests.
3864/// (this means we can call emitConjunctionRec() with
3865/// Negate==true on this sub-tree)
3866/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3867/// cannot do the negation naturally. We are required to
3868/// emit the subtree first in this case.
3869/// \param WillNegate Is true if we are called when the result of this
3870/// subexpression must be negated. This happens when the
3871/// outer expression is an OR. We can use this fact to know
3872/// that we have a double negation (or (or ...) ...) that
3873/// can be implemented for free.
3874static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3875 bool &MustBeFirst, bool WillNegate,
3876 unsigned Depth = 0) {
3877 if (!Val.hasOneUse())
3878 return false;
3879 unsigned Opcode = Val->getOpcode();
3880 if (Opcode == ISD::SETCC) {
3881 if (Val->getOperand(0).getValueType() == MVT::f128)
3882 return false;
3883 CanNegate = true;
3884 MustBeFirst = false;
3885 return true;
3886 }
3887 // Protect against exponential runtime and stack overflow.
3888 if (Depth > 6)
3889 return false;
3890 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3891 bool IsOR = Opcode == ISD::OR;
3892 SDValue O0 = Val->getOperand(0);
3893 SDValue O1 = Val->getOperand(1);
3894 bool CanNegateL;
3895 bool MustBeFirstL;
3896 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3897 return false;
3898 bool CanNegateR;
3899 bool MustBeFirstR;
3900 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3901 return false;
3902
3903 if (MustBeFirstL && MustBeFirstR)
3904 return false;
3905
3906 if (IsOR) {
3907 // For an OR expression we need to be able to naturally negate at least
3908 // one side or we cannot do the transformation at all.
3909 if (!CanNegateL && !CanNegateR)
3910 return false;
3911 // If the result of the OR will be negated and we can naturally negate
3912 // the leaves, then this sub-tree as a whole negates naturally.
3913 CanNegate = WillNegate && CanNegateL && CanNegateR;
3914 // If we cannot naturally negate the whole sub-tree, then this must be
3915 // emitted first.
3916 MustBeFirst = !CanNegate;
3917 } else {
3918 assert(Opcode == ISD::AND && "Must be OR or AND");
3919 // We cannot naturally negate an AND operation.
3920 CanNegate = false;
3921 MustBeFirst = MustBeFirstL || MustBeFirstR;
3922 }
3923 return true;
3924 }
3925 return false;
3926}
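// A small worked example (illustrative only): for
//   (and (setcc a) (or (setcc b) (setcc c)))
// the OR sub-tree reports CanNegate=false / MustBeFirst=true, which is still
// acceptable because the other operand is a plain SETCC. For
//   (or (and ...) (and ...))
// neither AND side can be negated naturally, so this returns false, matching
// the counterexample in the comment block above.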
3927
3928/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3929/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3930/// Tries to transform the given i1 producing node @p Val to a series of compare
3931/// and conditional compare operations. @returns an NZCV flags producing node
3932/// and sets @p OutCC to the flags that should be tested, or returns SDValue()
3933/// if the transformation was not possible.
3934/// \p Negate is true if we want this sub-tree to be negated just by changing
3935/// SETCC conditions.
3936static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3937 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3938 AArch64CC::CondCode Predicate) {
3939 // We're at a tree leaf, produce a conditional comparison operation.
3940 unsigned Opcode = Val->getOpcode();
3941 if (Opcode == ISD::SETCC) {
3942 SDValue LHS = Val->getOperand(0);
3943 SDValue RHS = Val->getOperand(1);
3944 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3945 bool isInteger = LHS.getValueType().isInteger();
3946 if (Negate)
3947 CC = getSetCCInverse(CC, LHS.getValueType());
3948 SDLoc DL(Val);
3949 // Determine OutCC and handle FP special case.
3950 if (isInteger) {
3951 OutCC = changeIntCCToAArch64CC(CC, RHS);
3952 } else {
3953 assert(LHS.getValueType().isFloatingPoint());
3954 AArch64CC::CondCode ExtraCC;
3955 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3956 // Some floating point conditions can't be tested with a single condition
3957 // code. Construct an additional comparison in this case.
3958 if (ExtraCC != AArch64CC::AL) {
3959 SDValue ExtraCmp;
3960 if (!CCOp.getNode())
3961 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3962 else
3963 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3964 ExtraCC, DL, DAG);
3965 CCOp = ExtraCmp;
3966 Predicate = ExtraCC;
3967 }
3968 }
3969
3970 // Produce a normal comparison if we are first in the chain
3971 if (!CCOp)
3972 return emitComparison(LHS, RHS, CC, DL, DAG);
3973 // Otherwise produce a ccmp.
3974 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3975 DAG);
3976 }
3977 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3978
3979 bool IsOR = Opcode == ISD::OR;
3980
3981 SDValue LHS = Val->getOperand(0);
3982 bool CanNegateL;
3983 bool MustBeFirstL;
3984 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3985 assert(ValidL && "Valid conjunction/disjunction tree");
3986 (void)ValidL;
3987
3988 SDValue RHS = Val->getOperand(1);
3989 bool CanNegateR;
3990 bool MustBeFirstR;
3991 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3992 assert(ValidR && "Valid conjunction/disjunction tree");
3993 (void)ValidR;
3994
3995 // Swap sub-tree that must come first to the right side.
3996 if (MustBeFirstL) {
3997 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3998 std::swap(LHS, RHS);
3999 std::swap(CanNegateL, CanNegateR);
4000 std::swap(MustBeFirstL, MustBeFirstR);
4001 }
4002
4003 bool NegateR;
4004 bool NegateAfterR;
4005 bool NegateL;
4006 bool NegateAfterAll;
4007 if (Opcode == ISD::OR) {
4008 // Swap the sub-tree that we can negate naturally to the left.
4009 if (!CanNegateL) {
4010 assert(CanNegateR && "at least one side must be negatable");
4011 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4012 assert(!Negate);
4013 std::swap(LHS, RHS);
4014 NegateR = false;
4015 NegateAfterR = true;
4016 } else {
4017 // Negate the left sub-tree if possible, otherwise negate the result.
4018 NegateR = CanNegateR;
4019 NegateAfterR = !CanNegateR;
4020 }
4021 NegateL = true;
4022 NegateAfterAll = !Negate;
4023 } else {
4024 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4025 assert(!Negate && "Valid conjunction/disjunction tree");
4026
4027 NegateL = false;
4028 NegateR = false;
4029 NegateAfterR = false;
4030 NegateAfterAll = false;
4031 }
4032
4033 // Emit sub-trees.
4034 AArch64CC::CondCode RHSCC;
4035 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4036 if (NegateAfterR)
4037 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4038 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4039 if (NegateAfterAll)
4040 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4041 return CmpL;
4042}
4043
4044/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4045/// In some cases this is even possible with OR operations in the expression.
4046/// See \ref AArch64CCMP.
4047/// \see emitConjunctionRec().
4048static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4049 AArch64CC::CondCode &OutCC) {
4050 bool DummyCanNegate;
4051 bool DummyMustBeFirst;
4052 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4053 return SDValue();
4054
4055 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4056}
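// As a concrete illustration (assuming plain C input and arbitrary
// registers), a condition like "a == 0 && b > 17" can be emitted as
//   cmp  w0, #0
//   ccmp w1, #17, #4, eq   // if eq, set flags from (w1 - 17); else NZCV=0b0100
//   b.gt ...
// i.e. one CMP followed by a CCMP whose final tested condition is gt.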
4057
4058/// @}
4059
4060/// Returns how profitable it is to fold a comparison's operand's shift and/or
4061/// extension operations.
4062static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4063 auto isSupportedExtend = [&](SDValue V) {
4064 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4065 return true;
4066
4067 if (V.getOpcode() == ISD::AND)
4068 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4069 uint64_t Mask = MaskCst->getZExtValue();
4070 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4071 }
4072
4073 return false;
4074 };
4075
4076 if (!Op.hasOneUse())
4077 return 0;
4078
4079 if (isSupportedExtend(Op))
4080 return 1;
4081
4082 unsigned Opc = Op.getOpcode();
4083 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4084 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4085 uint64_t Shift = ShiftCst->getZExtValue();
4086 if (isSupportedExtend(Op.getOperand(0)))
4087 return (Shift <= 4) ? 2 : 1;
4088 EVT VT = Op.getValueType();
4089 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4090 return 1;
4091 }
4092
4093 return 0;
4094}
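// Illustrative scoring (not from the original source), for a single-use
// operand:
//   (sign_extend_inreg x, i8)   --> 1  (sxtb folds into the compare)
//   (shl (and x, 0xff), 2)      --> 2  (folds as a uxtb #2 operand)
//   (shl x, 3)                  --> 1  (plain shifted-register form)
// Anything else, or a multi-use node, scores 0.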
4095
4096// emitComparison() converts a comparison with one or negative one to a
4097// comparison with 0. Note that this only works for signed comparisons
4098// because of how ANDS works.
4099static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4100 // Only works for ANDS and AND.
4101 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4102 return false;
4103
4104 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4105 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4106 return true;
4107 }
4108
4109 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4110 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4111 return true;
4112 }
4113
4114 return false;
4115}
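// For example (sketch only), with LHS = (and x, 1):
//   (and x, 1) <  1   becomes   (and x, 1) <= 0
//   (and x, 1) >= 1   becomes   (and x, 1) >  0
// so the later ANDS/TST can test the resulting flags directly without
// materializing the constant 1.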
4116
4117static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4118 SDValue &AArch64cc, SelectionDAG &DAG,
4119 const SDLoc &DL) {
4120 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4121 EVT VT = RHS.getValueType();
4122 APInt C = RHSC->getAPIntValue();
4123 // shouldBeAdjustedToZero is a special case to better fold with
4124 // emitComparison().
4125 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4126 // Adjust the constant to zero.
4127 // CC has already been adjusted.
4128 RHS = DAG.getConstant(0, DL, VT);
4129 } else if (!isLegalCmpImmed(C)) {
4130 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4131 // Constant does not fit, try adjusting it by one?
4132 switch (CC) {
4133 default:
4134 break;
4135 case ISD::SETLT:
4136 case ISD::SETGE:
4137 if (!C.isMinSignedValue()) {
4138 APInt CMinusOne = C - 1;
4139 if (isLegalCmpImmed(CMinusOne) ||
4140 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4141 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4142 RHS = DAG.getConstant(CMinusOne, DL, VT);
4143 }
4144 }
4145 break;
4146 case ISD::SETULT:
4147 case ISD::SETUGE: {
4148 // C is not 0 because it is a legal immediate.
4149 assert(!C.isZero() && "C should not be zero here");
4150 APInt CMinusOne = C - 1;
4151 if (isLegalCmpImmed(CMinusOne) ||
4152 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4153 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4154 RHS = DAG.getConstant(CMinusOne, DL, VT);
4155 }
4156 break;
4157 }
4158 case ISD::SETLE:
4159 case ISD::SETGT:
4160 if (!C.isMaxSignedValue()) {
4161 APInt CPlusOne = C + 1;
4162 if (isLegalCmpImmed(CPlusOne) ||
4163 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4164 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4165 RHS = DAG.getConstant(CPlusOne, DL, VT);
4166 }
4167 }
4168 break;
4169 case ISD::SETULE:
4170 case ISD::SETUGT: {
4171 if (!C.isAllOnes()) {
4172 APInt CPlusOne = C + 1;
4173 if (isLegalCmpImmed(CPlusOne) ||
4174 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4175 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4176 RHS = DAG.getConstant(CPlusOne, DL, VT);
4177 }
4178 }
4179 break;
4180 }
4181 }
4182 }
4183 }
4184
4185 // Comparisons are canonicalized so that the RHS operand is simpler than the
4186 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4187 // can fold some shift+extend operations on the RHS operand, so swap the
4188 // operands if that can be done.
4189 //
4190 // For example:
4191 // lsl w13, w11, #1
4192 // cmp w13, w12
4193 // can be turned into:
4194 // cmp w12, w11, lsl #1
4195 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4196 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4197 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4198 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4199 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4200
4201 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4202 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4203 std::swap(LHS, RHS);
4204 CC = ISD::getSetCCSwappedOperands(CC);
4205 }
4206 }
4207
4208 SDValue Cmp;
4209 AArch64CC::CondCode AArch64CC;
4210 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4211 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4212
4213 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4214 // For the i8 operand, the largest immediate is 255, so this can be easily
4215 // encoded in the compare instruction. For the i16 operand, however, the
4216 // largest immediate cannot be encoded in the compare.
4217 // Therefore, use a sign extending load and cmn to avoid materializing the
4218 // -1 constant. For example,
4219 // movz w1, #65535
4220 // ldrh w0, [x0, #0]
4221 // cmp w0, w1
4222 // >
4223 // ldrsh w0, [x0, #0]
4224 // cmn w0, #1
4225 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4226 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4227 // ensure both the LHS and RHS are truly zero extended and to make sure the
4228 // transformation is profitable.
4229 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4230 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4231 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4232 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4233 int16_t ValueofRHS = RHS->getAsZExtVal();
4234 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4235 SDValue SExt =
4236 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4237 DAG.getValueType(MVT::i16));
4238 Cmp = emitComparison(
4239 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4240 DL, DAG);
4242 }
4243 }
4244
4245 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4246 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4247 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4248 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4249 }
4250 }
4251 }
4252
4253 if (!Cmp) {
4254 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4256 }
4257 AArch64cc = getCondCode(DAG, AArch64CC);
4258 return Cmp;
4259}
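// A worked example of the immediate adjustment above (illustrative only):
// "x < 0xAB001" has an unencodable RHS, but since SETLT can be relaxed to
// SETLE on C-1 this becomes "x <= 0xAB000", which is encodable as
//   cmp x0, #0xab, lsl #12
// avoiding a separate mov/movk to materialize the constant.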
4260
4261static std::pair<SDValue, SDValue>
4262getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4263 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4264 "Unsupported value type");
4265 SDValue Value, Overflow;
4266 SDLoc DL(Op);
4267 SDValue LHS = Op.getOperand(0);
4268 SDValue RHS = Op.getOperand(1);
4269 unsigned Opc = 0;
4270 switch (Op.getOpcode()) {
4271 default:
4272 llvm_unreachable("Unknown overflow instruction!");
4273 case ISD::SADDO:
4274 Opc = AArch64ISD::ADDS;
4275 CC = AArch64CC::VS;
4276 break;
4277 case ISD::UADDO:
4278 Opc = AArch64ISD::ADDS;
4279 CC = AArch64CC::HS;
4280 break;
4281 case ISD::SSUBO:
4282 Opc = AArch64ISD::SUBS;
4283 CC = AArch64CC::VS;
4284 break;
4285 case ISD::USUBO:
4286 Opc = AArch64ISD::SUBS;
4287 CC = AArch64CC::LO;
4288 break;
4289 // Multiply needs a little bit of extra work.
4290 case ISD::SMULO:
4291 case ISD::UMULO: {
4292 CC = AArch64CC::NE;
4293 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4294 if (Op.getValueType() == MVT::i32) {
4295 // Extend to 64-bits, then perform a 64-bit multiply.
4296 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4297 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4298 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4299 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4300 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4301
4302 // Check that the result fits into a 32-bit integer.
4303 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4304 if (IsSigned) {
4305 // cmp xreg, wreg, sxtw
4306 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4307 Overflow =
4308 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4309 } else {
4310 // tst xreg, #0xffffffff00000000
4311 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4312 Overflow =
4313 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4314 }
4315 break;
4316 }
4317 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4318 // For the 64 bit multiply
4319 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4320 if (IsSigned) {
4321 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4322 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4323 DAG.getConstant(63, DL, MVT::i64));
4324 // It is important that LowerBits is last, otherwise the arithmetic
4325 // shift will not be folded into the compare (SUBS).
4326 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4327 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4328 .getValue(1);
4329 } else {
4330 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4331 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4332 Overflow =
4333 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4334 DAG.getConstant(0, DL, MVT::i64),
4335 UpperBits).getValue(1);
4336 }
4337 break;
4338 }
4339 } // switch (...)
4340
4341 if (Opc) {
4342 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4343
4344 // Emit the AArch64 operation with overflow check.
4345 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4346 Overflow = Value.getValue(1);
4347 }
4348 return std::make_pair(Value, Overflow);
4349}
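// For instance (a sketch, not from the original source), a 32-bit unsigned
// __builtin_mul_overflow goes through the UMULO path above and typically
// selects to something like:
//   umull x8, w0, w1                   // 64-bit product of the 32-bit inputs
//   tst   x8, #0xffffffff00000000      // ANDS against the upper-bits mask
//   cset  w9, ne                       // overflow iff any high bit is set
// with the low 32 bits of x8 used as the multiplication result.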
4350
4351SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4352 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4353 !Subtarget->isNeonAvailable()))
4354 return LowerToScalableOp(Op, DAG);
4355
4356 SDValue Sel = Op.getOperand(0);
4357 SDValue Other = Op.getOperand(1);
4358 SDLoc DL(Sel);
4359
4360 // If the operand is an overflow checking operation, invert the condition
4361 // code and kill the Not operation. I.e., transform:
4362 // (xor (overflow_op_bool, 1))
4363 // -->
4364 // (csel 1, 0, invert(cc), overflow_op_bool)
4365 // ... which later gets transformed to just a cset instruction with an
4366 // inverted condition code, rather than a cset + eor sequence.
4367 if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
4368 // Only lower legal XALUO ops.
4369 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4370 return SDValue();
4371
4372 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4373 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4374 AArch64CC::CondCode CC;
4375 SDValue Value, Overflow;
4376 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4377 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4378 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4379 CCVal, Overflow);
4380 }
4381 // If neither operand is a SELECT_CC, give up.
4382 if (Sel.getOpcode() != ISD::SELECT_CC)
4383 std::swap(Sel, Other);
4384 if (Sel.getOpcode() != ISD::SELECT_CC)
4385 return Op;
4386
4387 // The folding we want to perform is:
4388 // (xor x, (select_cc a, b, cc, 0, -1) )
4389 // -->
4390 // (csel x, (xor x, -1), cc ...)
4391 //
4392 // The latter will get matched to a CSINV instruction.
4393
4394 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4395 SDValue LHS = Sel.getOperand(0);
4396 SDValue RHS = Sel.getOperand(1);
4397 SDValue TVal = Sel.getOperand(2);
4398 SDValue FVal = Sel.getOperand(3);
4399
4400 // FIXME: This could be generalized to non-integer comparisons.
4401 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4402 return Op;
4403
4404 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4405 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4406
4407 // The values aren't constants, this isn't the pattern we're looking for.
4408 if (!CFVal || !CTVal)
4409 return Op;
4410
4411 // We can commute the SELECT_CC by inverting the condition. This
4412 // might be needed to make this fit into a CSINV pattern.
4413 if (CTVal->isAllOnes() && CFVal->isZero()) {
4414 std::swap(TVal, FVal);
4415 std::swap(CTVal, CFVal);
4416 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4417 }
4418
4419 // If the constants line up, perform the transform!
4420 if (CTVal->isZero() && CFVal->isAllOnes()) {
4421 SDValue CCVal;
4422 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4423
4424 FVal = Other;
4425 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4426 DAG.getAllOnesConstant(DL, Other.getValueType()));
4427
4428 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4429 CCVal, Cmp);
4430 }
4431
4432 return Op;
4433}
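// As an illustration (assuming plain C input and arbitrary registers),
//   res = x ^ (a < b ? 0 : -1);
// matches the pattern above and can be selected as
//   cmp   w1, w2
//   csinv w0, w0, w0, lt      // x if a < b, otherwise ~x
// rather than a cset/sub/eor style sequence.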
4434
4435// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4436// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4437// sets 'C' bit to 0.
4438static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4439 SDLoc DL(Value);
4440 EVT VT = Value.getValueType();
4441 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4442 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4443 SDValue Cmp =
4444 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4445 return Cmp.getValue(1);
4446}
4447
4448// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4449// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4450static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4451 bool Invert) {
4452 assert(Glue.getResNo() == 1);
4453 SDLoc DL(Glue);
4454 SDValue Zero = DAG.getConstant(0, DL, VT);
4455 SDValue One = DAG.getConstant(1, DL, VT);
4456 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4457 SDValue CC = getCondCode(DAG, Cond);
4458 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4459}
4460
4461// Value is 1 if 'V' bit of NZCV is 1, else 0
4462static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4463 assert(Glue.getResNo() == 1);
4464 SDLoc DL(Glue);
4465 SDValue Zero = DAG.getConstant(0, DL, VT);
4466 SDValue One = DAG.getConstant(1, DL, VT);
4467 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4468 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4469}
4470
4471// This lowering is inefficient, but it will get cleaned up by
4472// `foldOverflowCheck`.
4473static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4474 unsigned Opcode, bool IsSigned) {
4475 EVT VT0 = Op.getValue(0).getValueType();
4476 EVT VT1 = Op.getValue(1).getValueType();
4477
4478 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4479 return SDValue();
4480
4481 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4482 SDValue OpLHS = Op.getOperand(0);
4483 SDValue OpRHS = Op.getOperand(1);
4484 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4485
4486 SDLoc DL(Op);
4487
4488 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4489 OpRHS, OpCarryIn);
4490
4491 SDValue OutFlag =
4492 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4493 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4494
4495 return DAG.getMergeValues({Sum, OutFlag}, DL);
4496}
4497
4498SDValue AArch64TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
4499 // Let legalize expand this if it isn't a legal type yet.
4500 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4501 return SDValue();
4502
4503 SDLoc DL(Op);
4504 AArch64CC::CondCode CC;
4505 // The actual operation that sets the overflow or carry flag.
4506 SDValue Value, Overflow;
4507 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4508
4509 // We use 0 and 1 as false and true values.
4510 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4511 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4512
4513 // We use an inverted condition, because the conditional select is inverted
4514 // too. This will allow it to be selected to a single instruction:
4515 // CSINC Wd, WZR, WZR, invert(cond).
4516 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4517 Overflow =
4518 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4519
4520 return DAG.getMergeValues({Value, Overflow}, DL);
4521}
4522
4523// Prefetch operands are:
4524// 1: Address to prefetch
4525// 2: bool isWrite
4526// 3: int locality (0 = no locality ... 3 = extreme locality)
4527// 4: bool isDataCache
4528static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4529 SDLoc DL(Op);
4530 unsigned IsWrite = Op.getConstantOperandVal(2);
4531 unsigned Locality = Op.getConstantOperandVal(3);
4532 unsigned IsData = Op.getConstantOperandVal(4);
4533
4534 bool IsStream = !Locality;
4535 // When the locality number is set
4536 if (Locality) {
4537 // The front-end should have filtered out the out-of-range values
4538 assert(Locality <= 3 && "Prefetch locality out-of-range");
4539 // The locality degree is the opposite of the cache speed.
4540 // Put the number the other way around.
4541 // The encoding starts at 0 for level 1
4542 Locality = 3 - Locality;
4543 }
4544
4545 // Build the mask value encoding the expected behavior.
4546 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4547 (!IsData << 3) | // IsDataCache bit
4548 (Locality << 1) | // Cache level bits
4549 (unsigned)IsStream; // Stream bit
4550 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4551 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4552 Op.getOperand(1));
4553}
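// Encoding example (illustrative): __builtin_prefetch(p, /*rw=*/0,
// /*locality=*/3) arrives here with IsWrite=0, Locality=3, IsData=1, giving
// PrfOp = (0<<4) | (0<<3) | (0<<1) | 0 = 0, i.e. PLDL1KEEP, while locality 0
// selects the streaming (non-temporal) PLDL1STRM form instead.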
4554
4555// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
4556// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4557// (AND X Y) Z which produces a better opt with EmitComparison
4558static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4559 SelectionDAG &DAG, const SDLoc DL) {
4560 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4561 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4562 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4563 if (LHSConstOp && RHSConst) {
4564 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4565 uint64_t RHSConstant = RHSConst->getZExtValue();
4566 if (isPowerOf2_64(RHSConstant)) {
4567 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4568 LHS =
4569 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4570 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4571 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4572 CC = ISD::SETEQ;
4573 }
4574 }
4575 }
4576}
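// Worked example (not from the original source): for
//   (setcc ult (and x, 0xff), 16)
// the new mask is 0xff & ~15 = 0xf0, so this is rewritten to
//   (setcc eq (and x, 0xf0), 0)
// which can then lower to a single "tst w0, #0xf0" instead of an and + cmp.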
4577
4578SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4579 SelectionDAG &DAG) const {
4580 EVT VT = Op.getValueType();
4581 if (VT.isScalableVector()) {
4582 SDValue SrcVal = Op.getOperand(0);
4583
4584 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4585 // Break conversion in two with the first part converting to f32 and the
4586 // second using native f32->VT instructions.
4587 SDLoc DL(Op);
4588 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4589 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4590 }
4591
4592 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4593 }
4594
4595 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4596 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4597
4598 bool IsStrict = Op->isStrictFPOpcode();
4599 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4600 EVT Op0VT = Op0.getValueType();
4601 if (VT == MVT::f64) {
4602 // f32->f64 and f16->f64 extends are legal.
4603 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4604 return Op;
4605 // Split bf16->f64 extends into two fpextends.
4606 if (Op0VT == MVT::bf16 && IsStrict) {
4607 SDValue Ext1 =
4608 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4609 {Op0, Op.getOperand(0)});
4610 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4611 {Ext1, Ext1.getValue(1)});
4612 }
4613 if (Op0VT == MVT::bf16)
4614 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4615 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4616 return SDValue();
4617 }
4618
4619 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4620 return SDValue();
4621}
4622
4623SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4624 SelectionDAG &DAG) const {
4625 EVT VT = Op.getValueType();
4626 bool IsStrict = Op->isStrictFPOpcode();
4627 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4628 EVT SrcVT = SrcVal.getValueType();
4629 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4630
4631 if (VT.isScalableVector()) {
4632 // Let common code split the operation.
4633 if (SrcVT == MVT::nxv8f32)
4634 return Op;
4635
4636 if (VT.getScalarType() != MVT::bf16)
4637 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4638
4639 SDLoc DL(Op);
4640 constexpr EVT I32 = MVT::nxv4i32;
4641 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4642
4643 SDValue NaN;
4644 SDValue Narrow;
4645
4646 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4647 if (Subtarget->hasBF16())
4648 return LowerToPredicatedOp(Op, DAG,
4649 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4650
4651 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4652
4653 // Set the quiet bit.
4654 if (!DAG.isKnownNeverSNaN(SrcVal))
4655 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4656 } else if (SrcVT == MVT::nxv2f64 &&
4657 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4658 // Round to float without introducing rounding errors and try again.
4659 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4660 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4661 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4662
4662
4663 SmallVector<SDValue, 3> NewOps;
4664 if (IsStrict)
4665 NewOps.push_back(Op.getOperand(0));
4666 NewOps.push_back(Narrow);
4667 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4668 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4669 } else
4670 return SDValue();
4671
4672 if (!Trunc) {
4673 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4674 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4675 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4676 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4677 }
4678
4679 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4680 // 0x80000000.
4681 if (NaN) {
4682 EVT I1 = I32.changeElementType(MVT::i1);
4683 EVT CondVT = VT.changeElementType(MVT::i1);
4684 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4685 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4686 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4687 }
4688
4689 // Now that we have rounded, shift the bits into position.
4690 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4691 return getSVESafeBitCast(VT, Narrow, DAG);
4692 }
4693
4694 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4695 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4696
4697 // Expand cases where the result type is BF16 but we don't have hardware
4698 // instructions to lower it.
4699 if (VT.getScalarType() == MVT::bf16 &&
4700 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4701 Subtarget->hasBF16())) {
4702 SDLoc DL(Op);
4703 SDValue Narrow = SrcVal;
4704 SDValue NaN;
4705 EVT I32 = SrcVT.changeElementType(MVT::i32);
4706 EVT F32 = SrcVT.changeElementType(MVT::f32);
4707 if (SrcVT.getScalarType() == MVT::f32) {
4708 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4709 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4710 if (!NeverSNaN) {
4711 // Set the quiet bit.
4712 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4713 DAG.getConstant(0x400000, DL, I32));
4714 }
4715 } else if (SrcVT.getScalarType() == MVT::f64) {
4716 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4717 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4718 } else {
4719 return SDValue();
4720 }
4721 if (!Trunc) {
4722 SDValue One = DAG.getConstant(1, DL, I32);
4723 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4724 DAG.getShiftAmountConstant(16, I32, DL));
4725 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4726 SDValue RoundingBias =
4727 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4728 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4729 }
4730
4731 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4732 // 0x80000000.
4733 if (NaN) {
4734 SDValue IsNaN = DAG.getSetCC(
4735 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4736 SrcVal, SrcVal, ISD::SETUO);
4737 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4738 }
4739
4740 // Now that we have rounded, shift the bits into position.
4741 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4742 DAG.getShiftAmountConstant(16, I32, DL));
4743 if (VT.isVector()) {
4744 EVT I16 = I32.changeVectorElementType(MVT::i16);
4745 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4746 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4747 }
4748 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4749 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4750 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4751 : Result;
4752 }
4753
4754 if (SrcVT != MVT::f128) {
4755 // Expand cases where the input is a vector bigger than NEON.
4757 return SDValue();
4758
4759 // It's legal except when f128 is involved
4760 return Op;
4761 }
4762
4763 return SDValue();
4764}
4765
4766SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4767 SelectionDAG &DAG) const {
4768 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4769 // Any additional optimization in this function should be recorded
4770 // in the cost tables.
4771 bool IsStrict = Op->isStrictFPOpcode();
4772 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4773 EVT VT = Op.getValueType();
4774
4775 assert(!(IsStrict && VT.isScalableVector()) &&
4776 "Unimplemented SVE support for STRICT_FP_to_INT!");
4777
4778 // f16 conversions are promoted to f32 when full fp16 is not supported.
4779 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4780 InVT.getVectorElementType() == MVT::bf16) {
4781 EVT NewVT = VT.changeElementType(MVT::f32);
4782 SDLoc DL(Op);
4783 if (IsStrict) {
4784 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4785 {Op.getOperand(0), Op.getOperand(1)});
4786 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4787 {Ext.getValue(1), Ext.getValue(0)});
4788 }
4789 return DAG.getNode(
4790 Op.getOpcode(), DL, Op.getValueType(),
4791 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4792 }
4793
4794 if (VT.isScalableVector()) {
4795 if (VT.getVectorElementType() == MVT::i1) {
4796 SDLoc DL(Op);
4797 EVT CvtVT = getPromotedVTForPredicate(VT);
4798 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4799 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4800 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4801 }
4802
4803 // Let common code split the operation.
4804 if (InVT == MVT::nxv8f32)
4805 return Op;
4806
4807 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4808 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4809 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4810 return LowerToPredicatedOp(Op, DAG, Opcode);
4811 }
4812
4813 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4814 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4815 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4816
4817 uint64_t VTSize = VT.getFixedSizeInBits();
4818 uint64_t InVTSize = InVT.getFixedSizeInBits();
4819 if (VTSize < InVTSize) {
4820 SDLoc DL(Op);
4821 if (IsStrict) {
4822 InVT = InVT.changeVectorElementTypeToInteger();
4823 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4824 {Op.getOperand(0), Op.getOperand(1)});
4825 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4826 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4827 }
4828 SDValue Cv =
4829 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4830 Op.getOperand(0));
4831 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4832 }
4833
4834 if (VTSize > InVTSize) {
4835 SDLoc DL(Op);
4836 MVT ExtVT =
4837 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4838 VT.getVectorNumElements());
4839 if (IsStrict) {
4840 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4841 {Op.getOperand(0), Op.getOperand(1)});
4842 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4843 {Ext.getValue(1), Ext.getValue(0)});
4844 }
4845 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4846 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4847 }
4848
4849 // Use a scalar operation for conversions between single-element vectors of
4850 // the same size.
4851 if (InVT.getVectorNumElements() == 1) {
4852 SDLoc DL(Op);
4853 SDValue Extract = DAG.getNode(
4854 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(),
4855 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4856 EVT ScalarVT = VT.getScalarType();
4857 if (IsStrict)
4858 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4859 {Op.getOperand(0), Extract});
4860 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4861 }
4862
4863 // Type changing conversions are illegal.
4864 return Op;
4865}
4866
4867SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4868 SelectionDAG &DAG) const {
4869 bool IsStrict = Op->isStrictFPOpcode();
4870 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4871
4872 if (SrcVal.getValueType().isVector())
4873 return LowerVectorFP_TO_INT(Op, DAG);
4874
4875 // f16 conversions are promoted to f32 when full fp16 is not supported.
4876 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4877 SrcVal.getValueType() == MVT::bf16) {
4878 SDLoc DL(Op);
4879 if (IsStrict) {
4880 SDValue Ext =
4881 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4882 {Op.getOperand(0), SrcVal});
4883 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4884 {Ext.getValue(1), Ext.getValue(0)});
4885 }
4886 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4887 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4888 }
4889
4890 if (SrcVal.getValueType() != MVT::f128) {
4891 // It's legal except when f128 is involved
4892 return Op;
4893 }
4894
4895 return SDValue();
4896}
4897
4898SDValue
4899AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4900 SelectionDAG &DAG) const {
4901 // AArch64 FP-to-int conversions saturate to the destination element size, so
4902 // we can lower common saturating conversions to simple instructions.
4903 SDValue SrcVal = Op.getOperand(0);
4904 EVT SrcVT = SrcVal.getValueType();
4905 EVT DstVT = Op.getValueType();
4906 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4907
4908 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4909 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4910 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4911 assert(SatWidth <= DstElementWidth &&
4912 "Saturation width cannot exceed result width");
4913
4914 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4915 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4916 // types, so this is hard to reach.
4917 if (DstVT.isScalableVector())
4918 return SDValue();
4919
4920 EVT SrcElementVT = SrcVT.getVectorElementType();
4921
4922 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4923 SDLoc DL(Op);
4924 SDValue SrcVal2;
4925 if ((SrcElementVT == MVT::f16 &&
4926 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4927 SrcElementVT == MVT::bf16) {
4928 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4929 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4930 // If we are extending to a v8f32, split into two v4f32 to produce legal
4931 // types.
4932 if (F32VT.getSizeInBits() > 128) {
4933 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4934 F32VT = F32VT.getHalfNumVectorElementsVT();
4935 }
4936 SrcVT = F32VT;
4937 SrcElementVT = MVT::f32;
4938 SrcElementWidth = 32;
4939 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4940 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4941 return SDValue();
4942
4943 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4944 // width and produce an fcvtzu.
4945 if (SatWidth == 64 && SrcElementWidth < 64) {
4946 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4947 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4948 SrcVT = F64VT;
4949 SrcElementVT = MVT::f64;
4950 SrcElementWidth = 64;
4951 }
4952 // Cases that we can emit directly.
4953 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4954 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4955 DAG.getValueType(DstVT.getScalarType()));
4956 if (SrcVal2) {
4957 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4958 DAG.getValueType(DstVT.getScalarType()));
4959 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4960 }
4961 return Res;
4962 }
4963
4964 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4965 // result. This is only valid if the legal cvt is larger than the saturate
4966 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4967 // (at least until sqxtn is selected).
4968 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4969 return SDValue();
4970
4971 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4972 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4973 DAG.getValueType(IntVT.getScalarType()));
4974 SDValue NativeCvt2 =
4975 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4976 DAG.getValueType(IntVT.getScalarType()))
4977 : SDValue();
4978 SDValue Sat, Sat2;
4979 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4980 SDValue MinC = DAG.getConstant(
4981 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4982 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4983 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4984 SDValue MaxC = DAG.getConstant(
4985 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4986 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4987 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4988 } else {
4989 SDValue MinC = DAG.getConstant(
4990 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4991 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4992 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4993 }
4994
4995 if (SrcVal2)
4996 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4998 Sat, Sat2);
4999
5000 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5001}
5002
5003SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5004 SelectionDAG &DAG) const {
5005 // AArch64 FP-to-int conversions saturate to the destination register size, so
5006 // we can lower common saturating conversions to simple instructions.
5007 SDValue SrcVal = Op.getOperand(0);
5008 EVT SrcVT = SrcVal.getValueType();
5009
5010 if (SrcVT.isVector())
5011 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5012
5013 EVT DstVT = Op.getValueType();
5014 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5015 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5016 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5017 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5018
5019 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5020 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5021 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5022 SrcVT = MVT::f32;
5023 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5024 SrcVT != MVT::bf16)
5025 return SDValue();
5026
5027 SDLoc DL(Op);
5028 // Cases that we can emit directly.
5029 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5030 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5031 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5032 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5033 DAG.getValueType(DstVT));
5034
5035 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5036 // result. This is only valid if the legal cvt is larger than the saturate
5037 // width.
5038 if (DstWidth < SatWidth)
5039 return SDValue();
5040
5041 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5042 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5043 SDValue CVTf32 =
5044 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5045 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5046 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5047 DAG.getValueType(SatVT));
5048 }
5049 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5050 return DAG.getBitcast(DstVT, CVTf32);
5051 }
5052
5053 SDValue NativeCvt =
5054 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5055 SDValue Sat;
5056 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5057 SDValue MinC = DAG.getConstant(
5058 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5059 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5060 SDValue MaxC = DAG.getConstant(
5061 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5062 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5063 } else {
5064 SDValue MinC = DAG.getConstant(
5065 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5066 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5067 }
5068
5069 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5070}
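// For example (illustrative): llvm.fptosi.sat.i32.f32 maps directly onto
//   fcvtzs w0, s0
// because the hardware conversion already saturates to the i32 range, while
// a narrower saturation width (e.g. i8) is handled by converting to the full
// register width first and clamping with the smin/smax pair built above.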
5071
5072SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5073 SelectionDAG &DAG) const {
5074 EVT VT = Op.getValueType();
5075 SDValue Src = Op.getOperand(0);
5076 SDLoc DL(Op);
5077
5078 assert(VT.isVector() && "Expected vector type");
5079
5080 EVT CastVT =
5081 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5082
5083 // Round the floating-point value into a floating-point register with the
5084 // current rounding mode.
5085 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5086
5087 // Truncate the rounded floating point to an integer.
5088 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5089 DAG.getValueType(VT.getVectorElementType()));
5090}
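// In effect (a sketch; exact instruction selection may vary), llvm.lrint on
// a <2 x double> becomes
//   frintx v0.2d, v0.2d     // round using the current rounding mode
//   fcvtzs v0.2d, v0.2d     // then convert, saturating on overflow
// which is why an FRINT followed by FP_TO_SINT_SAT is built above.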
5091
5092SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5093 SelectionDAG &DAG) const {
5094 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5095 // Any additional optimization in this function should be recorded
5096 // in the cost tables.
5097 bool IsStrict = Op->isStrictFPOpcode();
5098 EVT VT = Op.getValueType();
5099 SDLoc DL(Op);
5100 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5101 EVT InVT = In.getValueType();
5102 unsigned Opc = Op.getOpcode();
5103 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5104
5105 assert(!(IsStrict && VT.isScalableVector()) &&
5106 "Unimplemented SVE support for ISD::STRICT_INT_TO_FP!");
5107
5108 // NOTE: i1->bf16 does not require promotion to f32.
5109 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5110 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5111 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5112 : DAG.getConstantFP(1.0, DL, VT);
5113 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5114 }
5115
5116 // Promote bf16 conversions to f32.
5117 if (VT.getVectorElementType() == MVT::bf16) {
5118 EVT F32 = VT.changeElementType(MVT::f32);
5119 if (IsStrict) {
5120 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5121 {Op.getOperand(0), In});
5122 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5123 {Op.getValueType(), MVT::Other},
5124 {Val.getValue(1), Val.getValue(0),
5125 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5126 }
5127 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5128 DAG.getNode(Op.getOpcode(), DL, F32, In),
5129 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5130 }
5131
5132 if (VT.isScalableVector()) {
5133 // Let common code split the operation.
5134 if (VT == MVT::nxv8f32)
5135 return Op;
5136
5137 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5138 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5139 return LowerToPredicatedOp(Op, DAG, Opcode);
5140 }
5141
5142 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5143 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5144 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5145
5146 uint64_t VTSize = VT.getFixedSizeInBits();
5147 uint64_t InVTSize = InVT.getFixedSizeInBits();
5148 if (VTSize < InVTSize) {
5149 // AArch64 doesn't have a direct vector instruction to convert
5150 // fixed point to floating point AND narrow it at the same time.
5151 // Additional rounding when the target is f32/f64 causes double
5152 // rounding issues. Conversion to f16 is fine due to narrow width.
5153 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5154 bool IsTargetf16 = false;
5155 if (Op.hasOneUse() &&
5156 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5157 // Some vector types are split during legalization into half, followed by
5158 // concatenation, followed by rounding to the original vector type. If we
5159 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5160 SDNode *U = *Op->user_begin();
5161 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5162 EVT TmpVT = U->user_begin()->getValueType(0);
5163 if (TmpVT.getScalarType() == MVT::f16)
5164 IsTargetf16 = true;
5165 }
5166 }
5167
5168 if (IsTargetf32 && !IsTargetf16) {
5169 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5170 }
5171
5172 MVT CastVT =
5173 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5174 InVT.getVectorNumElements());
5175 if (IsStrict) {
5176 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5177 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5178 {In.getValue(1), In.getValue(0),
5179 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5180 }
5181 In = DAG.getNode(Opc, DL, CastVT, In);
5182 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5183 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5184 }
5185
5186 if (VTSize > InVTSize) {
5187 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5188 EVT CastVT = VT.changeVectorElementTypeToInteger();
5189 In = DAG.getNode(CastOpc, DL, CastVT, In);
5190 if (IsStrict)
5191 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5192 return DAG.getNode(Opc, DL, VT, In);
5193 }
5194
5195 // Use a scalar operation for conversions between single-element vectors of
5196 // the same size.
5197 if (VT.getVectorNumElements() == 1) {
5198 SDValue Extract =
5199 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5200 DAG.getConstant(0, DL, MVT::i64));
5201 EVT ScalarVT = VT.getScalarType();
5202 if (IsStrict)
5203 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5204 {Op.getOperand(0), Extract});
5205 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5206 }
5207
5208 return Op;
5209}
5210
5211SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5212 SelectionDAG &DAG) const {
5213 if (Op.getValueType().isVector())
5214 return LowerVectorINT_TO_FP(Op, DAG);
5215
5216 bool IsStrict = Op->isStrictFPOpcode();
5217 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5218
5219 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5220 Op->getOpcode() == ISD::SINT_TO_FP;
5221
5222 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5223 SDLoc DL(Op);
5224 if (IsStrict) {
5225 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5226 {Op.getOperand(0), SrcVal});
5227 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5228 {Op.getValueType(), MVT::Other},
5229 {Val.getValue(1), Val.getValue(0),
5230 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5231 }
5232 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5233 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5234 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5235 };
5236
5237 if (Op.getValueType() == MVT::bf16) {
5238 unsigned MaxWidth = IsSigned
5239 ? DAG.ComputeMaxSignificantBits(SrcVal)
5240 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5241 // bf16 conversions are promoted to f32 when converting from i16.
5242 if (MaxWidth <= 24) {
5243 return IntToFpViaPromotion(MVT::f32);
5244 }
5245
5246 // bf16 conversions are promoted to f64 when converting from i32.
5247 if (MaxWidth <= 53) {
5248 return IntToFpViaPromotion(MVT::f64);
5249 }
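// Illustrative note (not from the upstream source): f32 carries 24 significand
// bits and f64 carries 53, so any source value with at most that many
// significant bits converts exactly and the only rounding happens in the
// final fptrunc to bf16.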
5250
5251 // We need to be careful about i64 -> bf16.
5252 // Consider the value 22216703.
5253 // This number cannot be represented exactly as an f32, so an itofp will
5254 // turn it into 22216704.0; an fptrunc to bf16 then turns this into
5255 // 22282240.0, whereas the correctly rounded bf16 result is 22151168.0.
5256 // We need to use sticky rounding to get this correct.
5257 if (SrcVal.getValueType() == MVT::i64) {
5258 SDLoc DL(Op);
5259 // This algorithm is equivalent to the following:
5260 // uint64_t SrcHi = SrcVal & ~0xfffull;
5261 // uint64_t SrcLo = SrcVal & 0xfffull;
5262 // uint64_t Highest = SrcVal >> 53;
5263 // bool HasHighest = Highest != 0;
5264 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5265 // double Rounded = static_cast<double>(ToRound);
5266 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5267 // uint64_t HasLo = SrcLo != 0;
5268 // bool NeedsAdjustment = HasHighest & HasLo;
5269 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5270 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5271 // return static_cast<__bf16>(Adjusted);
5272 //
5273 // Essentially, what happens is that SrcVal either fits perfectly in a
5274 // double-precision value or it is too big. If it is sufficiently small,
5275 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5276 // ensure that u64 -> double has no rounding error by only using the 52
5277 // MSB of the input. The low order bits will get merged into a sticky bit
5278 // which will avoid issues incurred by double rounding.
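// Worked example (illustration only, not from the upstream source): take
// SrcVal = 2^60 + 2^52 + 1. Rounding it directly to bf16 gives 2^60 + 2^53,
// since the trailing +1 lies just above the halfway point. A plain i64 -> f64
// conversion drops that +1 and yields exactly 2^60 + 2^52, which the final
// f64 -> bf16 rounding then ties back down to 2^60. With the scheme above,
// SrcHi = 2^60 + 2^52 converts exactly, SrcLo = 1 is non-zero, so the low bit
// of the double's significand is set, the value sits just past the halfway
// point again, and the final rounding produces 2^60 + 2^53 as required.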
5279
5280 // Signed conversion is more or less like so:
5281 // copysign((__bf16)abs(SrcVal), SrcVal)
5282 SDValue SignBit;
5283 if (IsSigned) {
5284 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5285 DAG.getConstant(1ull << 63, DL, MVT::i64));
5286 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5287 }
5288 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5289 DAG.getConstant(~0xfffull, DL, MVT::i64));
5290 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5291 DAG.getConstant(0xfffull, DL, MVT::i64));
5292 SDValue Highest =
5293 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5294 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5295 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5296 SDValue ToRound =
5297 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5298 SDValue Rounded =
5299 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5300 {Op.getOperand(0), ToRound})
5301 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5302
5303 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5304 if (SignBit) {
5305 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5306 }
5307
5308 SDValue HasHighest = DAG.getSetCC(
5309 DL,
5310 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5311 Highest, Zero64, ISD::SETNE);
5312
5313 SDValue HasLo = DAG.getSetCC(
5314 DL,
5315 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5316 SrcLo, Zero64, ISD::SETNE);
5317
5318 SDValue NeedsAdjustment =
5319 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5320 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5321
5322 SDValue AdjustedBits =
5323 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5324 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5325 return IsStrict
5326 ? DAG.getNode(
5327 ISD::STRICT_FP_ROUND, DL,
5328 {Op.getValueType(), MVT::Other},
5329 {Rounded.getValue(1), Adjusted,
5330 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5331 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5332 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5333 }
5334 }
5335
5336 // f16 conversions are promoted to f32 when full fp16 is not supported.
5337 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5338 return IntToFpViaPromotion(MVT::f32);
5339 }
5340
5341 // i128 conversions are libcalls.
5342 if (SrcVal.getValueType() == MVT::i128)
5343 return SDValue();
5344
5345 // Other conversions are legal, unless it's to the completely software-based
5346 // fp128.
5347 if (Op.getValueType() != MVT::f128)
5348 return Op;
5349 return SDValue();
5350}
5351
5352SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5353 SelectionDAG &DAG) const {
5354 // For iOS, we want to call an alternative entry point: __sincos_stret,
5355 // which returns the values in two S / D registers.
5356 SDLoc DL(Op);
5357 SDValue Arg = Op.getOperand(0);
5358 EVT ArgVT = Arg.getValueType();
5359 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5360
5362 Args.emplace_back(Arg, ArgTy);
5363
5364 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5365 : RTLIB::SINCOS_STRET_F32;
5366 const char *LibcallName = getLibcallName(LC);
5367 SDValue Callee =
5368 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5369
5370 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5371 TargetLowering::CallLoweringInfo CLI(DAG);
5372 CallingConv::ID CC = getLibcallCallingConv(LC);
5373 CLI.setDebugLoc(DL)
5374 .setChain(DAG.getEntryNode())
5375 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5376
5377 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5378 return CallResult.first;
5379}
5380
5381static MVT getSVEContainerType(EVT ContentTy);
5382
5383SDValue
5384AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5385 SelectionDAG &DAG) const {
5386 SDLoc DL(Op);
5387 uint64_t EltSize = Op.getConstantOperandVal(2);
5388 EVT VT = Op.getValueType();
5389 switch (EltSize) {
5390 case 1:
5391 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5392 return SDValue();
5393 break;
5394 case 2:
5395 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5396 return SDValue();
5397 break;
5398 case 4:
5399 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5400 return SDValue();
5401 break;
5402 case 8:
5403 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5404 return SDValue();
5405 break;
5406 default:
5407 // Other element sizes are incompatible with whilewr/rw, so expand instead
5408 return SDValue();
5409 }
5410
5411 SDValue PtrA = Op.getOperand(0);
5412 SDValue PtrB = Op.getOperand(1);
5413
5414 if (VT.isScalableVT())
5415 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5416
5417 // We can use the SVE whilewr/whilerw instruction to lower this
5418 // intrinsic by creating the appropriate sequence of scalable vector
5419 // operations and then extracting a fixed-width subvector from the scalable
5420 // vector. Scalable vector variants are already legal.
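// For example (illustration only): with VT = v16i8 and EltSize = 1, the
// container below is nxv16i8 and WhileVT is nxv16i1; the mask is computed as
// nxv16i1, sign-extended to nxv16i8, and its low 16 lanes are extracted as
// the v16i8 result.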
5421 EVT ContainerVT =
5422 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5423 VT.getVectorNumElements(), true);
5424 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5425
5426 SDValue Mask =
5427 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5428 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5429 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5430 DAG.getVectorIdxConstant(0, DL));
5431}
5432
5433SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5434 SelectionDAG &DAG) const {
5435 EVT OpVT = Op.getValueType();
5436 EVT ArgVT = Op.getOperand(0).getValueType();
5437
5439 return LowerFixedLengthBitcastToSVE(Op, DAG);
5440
5441 if (OpVT.isScalableVector()) {
5442 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5443
5444 // Handle type legalisation first.
5445 if (!isTypeLegal(ArgVT)) {
5446 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5447 "Expected int->fp bitcast!");
5448
5449 // Bitcasting between unpacked vector types of different element counts is
5450 // not a NOP because the live elements are laid out differently.
5451 // 01234567
5452 // e.g. nxv2i32 = XX??XX??
5453 // nxv4f16 = X?X?X?X?
5454 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5455 return SDValue();
5456
5457 SDValue ExtResult =
5458 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5459 Op.getOperand(0));
5460 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5461 }
5462
5463 // Bitcasts between legal types with the same element count are legal.
5464 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5465 return Op;
5466
5467 // getSVESafeBitCast does not support casting between unpacked types.
5468 if (!isPackedVectorType(OpVT, DAG))
5469 return SDValue();
5470
5471 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5472 }
5473
5474 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5475 return SDValue();
5476
5477 // Bitcasts between f16 and bf16 are legal.
5478 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5479 return Op;
5480
5481 assert(ArgVT == MVT::i16);
5482 SDLoc DL(Op);
5483
5484 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5485 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5486 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5487}
5488
5489// Returns lane if Op extracts from a two-element vector and lane is constant
5490// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5491static std::optional<uint64_t>
5492 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5493 SDNode *OpNode = Op.getNode();
5494 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5495 return std::nullopt;
5496
5497 EVT VT = OpNode->getOperand(0).getValueType();
5498 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5499 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5500 return std::nullopt;
5501
5502 return C->getZExtValue();
5503}
5504
5505static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5506 bool isSigned) {
5507 EVT VT = N.getValueType();
5508
5509 if (N.getOpcode() != ISD::BUILD_VECTOR)
5510 return false;
5511
5512 for (const SDValue &Elt : N->op_values()) {
5513 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5514 unsigned EltSize = VT.getScalarSizeInBits();
5515 unsigned HalfSize = EltSize / 2;
5516 if (isSigned) {
5517 if (!isIntN(HalfSize, C->getSExtValue()))
5518 return false;
5519 } else {
5520 if (!isUIntN(HalfSize, C->getZExtValue()))
5521 return false;
5522 }
5523 continue;
5524 }
5525 return false;
5526 }
5527
5528 return true;
5529}
5530
5531static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5532 EVT VT = N.getValueType();
5533 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5534 EVT HalfVT = EVT::getVectorVT(
5535 *DAG.getContext(),
5536 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5537 VT.getVectorNumElements());
5538 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5539}
5540
5541static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5542 return N.getOpcode() == ISD::SIGN_EXTEND ||
5543 N.getOpcode() == ISD::ANY_EXTEND ||
5544 isExtendedBUILD_VECTOR(N, DAG, true);
5545}
5546
5547static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5548 return N.getOpcode() == ISD::ZERO_EXTEND ||
5549 N.getOpcode() == ISD::ANY_EXTEND ||
5550 isExtendedBUILD_VECTOR(N, DAG, false);
5551}
5552
5553static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5554 unsigned Opcode = N.getOpcode();
5555 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5556 SDValue N0 = N.getOperand(0);
5557 SDValue N1 = N.getOperand(1);
5558 return N0->hasOneUse() && N1->hasOneUse() &&
5559 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5560 }
5561 return false;
5562}
5563
5564static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5565 unsigned Opcode = N.getOpcode();
5566 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5567 SDValue N0 = N.getOperand(0);
5568 SDValue N1 = N.getOperand(1);
5569 return N0->hasOneUse() && N1->hasOneUse() &&
5570 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5571 }
5572 return false;
5573}
5574
5575SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5576 SelectionDAG &DAG) const {
5577 // The rounding mode is in bits 23:22 of the FPCR.
5578 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5579 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5580 // so that the shift + and get folded into a bitfield extract.
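// For example, if FPCR.RMode is 0b01 (round towards plus infinity), then
// ((FPCR + (1 << 22)) >> 22) & 3 evaluates to (1 + 1) & 3 = 2, which is the
// FLT_ROUNDS value for rounding towards plus infinity.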
5581 SDLoc DL(Op);
5582
5583 SDValue Chain = Op.getOperand(0);
5584 SDValue FPCR_64 = DAG.getNode(
5585 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5586 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5587 Chain = FPCR_64.getValue(1);
5588 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5589 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5590 DAG.getConstant(1U << 22, DL, MVT::i32));
5591 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5592 DAG.getConstant(22, DL, MVT::i32));
5593 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5594 DAG.getConstant(3, DL, MVT::i32));
5595 return DAG.getMergeValues({AND, Chain}, DL);
5596}
5597
5598SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5599 SelectionDAG &DAG) const {
5600 SDLoc DL(Op);
5601 SDValue Chain = Op->getOperand(0);
5602 SDValue RMValue = Op->getOperand(1);
5603
5604 // The rounding mode is in bits 23:22 of the FPCR.
5605 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5606 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5607 // ((arg - 1) & 3) << 22).
5608 //
5609 // The argument of llvm.set.rounding must be within the range [0, 3], so
5610 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5611 // code that generates llvm.set.rounding to ensure this condition.
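// For example, llvm.set.rounding(0) (round towards zero) computes
// ((0 - 1) & 3) << 22 = 3 << 22, i.e. FPCR.RMode = 0b11, and
// llvm.set.rounding(2) (round towards plus infinity) yields FPCR.RMode = 0b01.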
5612
5613 // Calculate new value of FPCR[23:22].
5614 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5615 DAG.getConstant(1, DL, MVT::i32));
5616 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5617 DAG.getConstant(0x3, DL, MVT::i32));
5618 RMValue =
5619 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5620 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5621 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5622
5623 // Get current value of FPCR.
5624 SDValue Ops[] = {
5625 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5626 SDValue FPCR =
5627 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5628 Chain = FPCR.getValue(1);
5629 FPCR = FPCR.getValue(0);
5630
5631 // Put new rounding mode into FPCR[23:22].
5632 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5633 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5634 DAG.getConstant(RMMask, DL, MVT::i64));
5635 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5636 SDValue Ops2[] = {
5637 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5638 FPCR};
5639 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5640}
5641
5642SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5643 SelectionDAG &DAG) const {
5644 SDLoc DL(Op);
5645 SDValue Chain = Op->getOperand(0);
5646
5647 // Get current value of FPCR.
5648 SDValue Ops[] = {
5649 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5650 SDValue FPCR =
5651 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5652 Chain = FPCR.getValue(1);
5653 FPCR = FPCR.getValue(0);
5654
5655 // Truncate FPCR to 32 bits.
5656 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5657
5658 return DAG.getMergeValues({Result, Chain}, DL);
5659}
5660
5661SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5662 SelectionDAG &DAG) const {
5663 SDLoc DL(Op);
5664 SDValue Chain = Op->getOperand(0);
5665 SDValue Mode = Op->getOperand(1);
5666
5667 // Extend the specified value to 64 bits.
5668 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5669
5670 // Set new value of FPCR.
5671 SDValue Ops2[] = {
5672 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5673 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5674}
5675
5676SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5677 SelectionDAG &DAG) const {
5678 SDLoc DL(Op);
5679 SDValue Chain = Op->getOperand(0);
5680
5681 // Get current value of FPCR.
5682 SDValue Ops[] = {
5683 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5684 SDValue FPCR =
5685 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5686 Chain = FPCR.getValue(1);
5687 FPCR = FPCR.getValue(0);
5688
5689 // Clear bits that are not reserved.
5690 SDValue FPSCRMasked = DAG.getNode(
5691 ISD::AND, DL, MVT::i64, FPCR,
5692 DAG.getConstant(ReservedFPControlBits, DL, MVT::i64));
5693
5694 // Set new value of FPCR.
5695 SDValue Ops2[] = {Chain,
5696 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5697 FPSCRMasked};
5698 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5699}
5700
5701static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5702 SDLoc DL, bool &IsMLA) {
5703 bool IsN0SExt = isSignExtended(N0, DAG);
5704 bool IsN1SExt = isSignExtended(N1, DAG);
5705 if (IsN0SExt && IsN1SExt)
5706 return AArch64ISD::SMULL;
5707
5708 bool IsN0ZExt = isZeroExtended(N0, DAG);
5709 bool IsN1ZExt = isZeroExtended(N1, DAG);
5710
5711 if (IsN0ZExt && IsN1ZExt)
5712 return AArch64ISD::UMULL;
5713
5714 // Select UMULL if we can replace the other operand with an extend.
5715 EVT VT = N0.getValueType();
5716 unsigned EltSize = VT.getScalarSizeInBits();
5717 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5718 if (IsN0ZExt || IsN1ZExt) {
5719 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5720 return AArch64ISD::UMULL;
5721 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5722 DAG.MaskedValueIsZero(N1, Mask)) {
5723 // For v2i64 we look more aggressively at both operands being zero, to avoid
5724 // scalarization.
5725 return AArch64ISD::UMULL;
5726 }
5727
5728 if (IsN0SExt || IsN1SExt) {
5729 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5730 return AArch64ISD::SMULL;
5731 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5732 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5733 return AArch64ISD::SMULL;
5734 }
5735
5736 if (!IsN1SExt && !IsN1ZExt)
5737 return 0;
5738
5739 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5740 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5741 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5742 IsMLA = true;
5743 return AArch64ISD::SMULL;
5744 }
5745 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5746 IsMLA = true;
5747 return AArch64ISD::UMULL;
5748 }
5749 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5750 std::swap(N0, N1);
5751 IsMLA = true;
5752 return AArch64ISD::UMULL;
5753 }
5754 return 0;
5755}
5756
5757SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5758 EVT VT = Op.getValueType();
5759
5760 bool OverrideNEON = !Subtarget->isNeonAvailable();
5761 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5762 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5763
5764 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5765 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5766 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5767 "unexpected type for custom-lowering ISD::MUL");
5768 SDValue N0 = Op.getOperand(0);
5769 SDValue N1 = Op.getOperand(1);
5770 bool isMLA = false;
5771 EVT OVT = VT;
5772 if (VT.is64BitVector()) {
5773 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5774 isNullConstant(N0.getOperand(1)) &&
5775 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5776 isNullConstant(N1.getOperand(1))) {
5777 N0 = N0.getOperand(0);
5778 N1 = N1.getOperand(0);
5779 VT = N0.getValueType();
5780 } else {
5781 if (VT == MVT::v1i64) {
5782 if (Subtarget->hasSVE())
5783 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5784 // Fall through to expand this. It is not legal.
5785 return SDValue();
5786 } else
5787 // Other vector multiplications are legal.
5788 return Op;
5789 }
5790 }
5791
5792 SDLoc DL(Op);
5793 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5794
5795 if (!NewOpc) {
5796 if (VT.getVectorElementType() == MVT::i64) {
5797 // If SVE is available then i64 vector multiplications can also be made
5798 // legal.
5799 if (Subtarget->hasSVE())
5800 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5801 // Fall through to expand this. It is not legal.
5802 return SDValue();
5803 } else
5804 // Other vector multiplications are legal.
5805 return Op;
5806 }
5807
5808 // Legalize to a S/UMULL instruction
5809 SDValue Op0;
5810 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5811 if (!isMLA) {
5812 Op0 = skipExtensionForVectorMULL(N0, DAG);
5813 assert(Op0.getValueType().is64BitVector() &&
5814 Op1.getValueType().is64BitVector() &&
5815 "unexpected types for extended operands to VMULL");
5816 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5817 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5818 DAG.getConstant(0, DL, MVT::i64));
5819 }
5820 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5821 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5822 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5823 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5824 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5825 EVT Op1VT = Op1.getValueType();
5826 return DAG.getNode(
5827 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5828 DAG.getNode(N0.getOpcode(), DL, VT,
5829 DAG.getNode(NewOpc, DL, VT,
5830 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5831 DAG.getNode(NewOpc, DL, VT,
5832 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5833 DAG.getConstant(0, DL, MVT::i64));
5834}
5835
5836static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5837 int Pattern) {
5838 if (Pattern == AArch64SVEPredPattern::all)
5839 return DAG.getConstant(1, DL, VT);
5840 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5841 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5842}
5843
5844static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5845 bool IsSigned, bool IsEqual) {
5846 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5847 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5848
5849 if (!N->getValueType(0).isScalableVector() ||
5850 !isa<ConstantSDNode>(N->getOperand(Op1)))
5851 return SDValue();
5852
5853 SDLoc DL(N);
5854 APInt Y = N->getConstantOperandAPInt(Op1);
5855
5856 // When the second operand is the maximum value, comparisons that include
5857 // equality can never fail and thus we can return an all active predicate.
5858 if (IsEqual)
5859 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5860 return DAG.getConstant(1, DL, N->getValueType(0));
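// For example (illustration only): whilele(x, INT64_MAX) and
// whilels(x, UINT64_MAX) can never produce an inactive lane, so they lower to
// an all-active predicate. Similarly, whilelt(i64 0, i64 8) returning nxv16i1
// has exactly 8 active lanes, which always fit within the minimum 128-bit
// vector, so it can be lowered to a PTRUE with the VL8 pattern below.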
5861
5862 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5863 return SDValue();
5864
5865 APInt X = N->getConstantOperandAPInt(Op0);
5866
5867 bool Overflow;
5868 APInt NumActiveElems =
5869 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5870
5871 if (Overflow)
5872 return SDValue();
5873
5874 if (IsEqual) {
5875 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5876 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5877 : NumActiveElems.uadd_ov(One, Overflow);
5878 if (Overflow)
5879 return SDValue();
5880 }
5881
5882 std::optional<unsigned> PredPattern =
5883 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5884 unsigned MinSVEVectorSize = std::max(
5885 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5886 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5887 if (PredPattern != std::nullopt &&
5888 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5889 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5890
5891 return SDValue();
5892}
5893
5894// Returns a safe bitcast between two scalable vector predicates, where
5895// any newly created lanes from a widening bitcast are defined as zero.
5896static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5897 SDLoc DL(Op);
5898 EVT InVT = Op.getValueType();
5899
5900 assert(InVT.getVectorElementType() == MVT::i1 &&
5901 VT.getVectorElementType() == MVT::i1 &&
5902 "Expected a predicate-to-predicate bitcast");
5903 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5904 InVT.isScalableVector() &&
5905 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5906 "Only expect to cast between legal scalable predicate types!");
5907
5908 // Return the operand if the cast isn't changing type,
5909 if (InVT == VT)
5910 return Op;
5911
5912 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5913 // than VT. This will increase the chances of removing casts that introduce
5914 // new lanes, which have to be explicitly zero'd.
5915 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5916 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5917 Op.getOperand(1).getValueType().bitsGT(VT))
5918 Op = Op.getOperand(1);
5919
5920 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5921
5922 // We only have to zero the lanes if new lanes are being defined, e.g. when
5923 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5924 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5925 // we can return here.
5926 if (InVT.bitsGT(VT))
5927 return Reinterpret;
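// Roughly, using the same lane notation as the LowerBITCAST comment above
// (illustration only):
//             0123456789abcdef
//   nxv2i1  = P.......P.......   (one meaningful lane per 64-bit element)
//   nxv16i1 = PPPPPPPPPPPPPPPP   (every lane meaningful)
// After the reinterpret the '.' lanes are undefined, so they are forced to
// zero below by ANDing with a reinterpreted all-true nxv2i1 mask.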
5928
5929 // Check if the other lanes are already known to be zeroed by
5930 // construction.
5931 if (isZeroingInactiveLanes(Op))
5932 return Reinterpret;
5933
5934 // Zero the newly introduced lanes.
5935 SDValue Mask = DAG.getConstant(1, DL, InVT);
5936 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5937 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5938}
5939
5940SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5941 SDValue Chain, SDLoc DL,
5942 EVT VT) const {
5943 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5944 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
5945 getPointerTy(DAG.getDataLayout()));
5946 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5947 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5948 TargetLowering::CallLoweringInfo CLI(DAG);
5949 ArgListTy Args;
5950 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5951 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5952 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5953 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5954 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5955 Mask);
5956}
5957
5958// Lower an SME LDR/STR ZA intrinsic
5959// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5960// folded into the instruction
5961// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5962// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5963// and tile slice registers
5964// ldr(%tileslice, %ptr, %vecnum)
5965// ->
5966// %svl = rdsvl
5967// %ptr2 = %ptr + %svl * %vecnum
5968// %tileslice2 = %tileslice + %vecnum
5969// ldr [%tileslice2, 0], [%ptr2, 0]
5970// Case 3: If the vecnum is an immediate out of range, then the same is done as
5971// case 2, but the base and slice registers are modified by the greatest
5972// multiple of 16 not larger than the vecnum and the remainder is folded into the
5973// instruction. This means that successive loads and stores that are offset from
5974// each other can share the same base and slice register updates.
5975// ldr(%tileslice, %ptr, 22)
5976// ldr(%tileslice, %ptr, 23)
5977// ->
5978// %svl = rdsvl
5979// %ptr2 = %ptr + %svl * 16
5980// %tileslice2 = %tileslice + 16
5981// ldr [%tileslice2, 6], [%ptr2, 6]
5982// ldr [%tileslice2, 7], [%ptr2, 7]
5983// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5984// operand and the immediate can be folded into the instruction, like case 2.
5985// ldr(%tileslice, %ptr, %vecnum + 7)
5986// ldr(%tileslice, %ptr, %vecnum + 8)
5987// ->
5988// %svl = rdsvl
5989// %ptr2 = %ptr + %svl * %vecnum
5990// %tileslice2 = %tileslice + %vecnum
5991// ldr [%tileslice2, 7], [%ptr2, 7]
5992// ldr [%tileslice2, 8], [%ptr2, 8]
5993// Case 5: The vecnum being an add of an immediate out of range is also handled,
5994// in which case the same remainder logic as case 3 is used.
5995static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5996 SDLoc DL(N);
5997
5998 SDValue TileSlice = N->getOperand(2);
5999 SDValue Base = N->getOperand(3);
6000 SDValue VecNum = N->getOperand(4);
6001 int32_t ConstAddend = 0;
6002 SDValue VarAddend = VecNum;
6003
6004 // If the vnum is an add of an immediate, we can fold it into the instruction
6005 if (VecNum.getOpcode() == ISD::ADD &&
6006 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6007 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6008 VarAddend = VecNum.getOperand(0);
6009 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6010 ConstAddend = ImmNode->getSExtValue();
6011 VarAddend = SDValue();
6012 }
6013
6014 int32_t ImmAddend = ConstAddend % 16;
6015 if (int32_t C = (ConstAddend - ImmAddend)) {
6016 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6017 VarAddend = VarAddend
6018 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6019 : CVal;
6020 }
6021
6022 if (VarAddend) {
6023 // Get the vector length that will be multiplied by vnum
6024 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6025 DAG.getConstant(1, DL, MVT::i32));
6026
6027 // Multiply SVL and vnum then add it to the base
6028 SDValue Mul = DAG.getNode(
6029 ISD::MUL, DL, MVT::i64,
6030 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6031 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6032 // Just add vnum to the tileslice
6033 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6034 }
6035
6036 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6037 DL, MVT::Other,
6038 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6039 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6040}
6041
6042static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6043 SDLoc DL(Op);
6044 SDValue ID =
6045 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6046
6047 auto Op1 = Op.getOperand(1);
6048 auto Op2 = Op.getOperand(2);
6049 auto Mask = Op.getOperand(3);
6050
6051 EVT Op1VT = Op1.getValueType();
6052 EVT Op2VT = Op2.getValueType();
6053 EVT ResVT = Op.getValueType();
6054
6055 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6056 Op1VT.getVectorElementType() == MVT::i16) &&
6057 "Expected 8-bit or 16-bit characters.");
6058
6059 // Scalable vector type used to wrap operands.
6060 // A single container is enough for both operands because ultimately the
6061 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6062 EVT OpContainerVT = Op1VT.isScalableVector()
6063 ? Op1VT
6064 : getContainerForFixedLengthVector(DAG, Op1VT);
6065
6066 if (Op2VT.is128BitVector()) {
6067 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6068 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6069 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6070 if (ResVT.isScalableVector())
6071 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6072 DAG.getTargetConstant(0, DL, MVT::i64));
6073 } else {
6074 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6075 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6076 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6077 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6078 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6079 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6080 DAG.getConstant(0, DL, MVT::i64));
6081 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6082 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6083 }
6084
6085 // If the result is scalable, we just need to carry out the MATCH.
6086 if (ResVT.isScalableVector())
6087 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6088
6089 // If the result is fixed, we can still use MATCH but we need to wrap the
6090 // first operand and the mask in scalable vectors before doing so.
6091
6092 // Wrap the operands.
6093 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6094 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6095 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6096
6097 // Carry out the match.
6098 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6099 ID, Mask, Op1, Op2);
6100
6101 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6102 // (v16i8/v8i8).
6103 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6104 Match = convertFromScalableVector(DAG, Op1VT, Match);
6105 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6106}
6107
6108SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6109 SelectionDAG &DAG) const {
6110 unsigned IntNo = Op.getConstantOperandVal(1);
6111 SDLoc DL(Op);
6112 switch (IntNo) {
6113 default:
6114 return SDValue(); // Don't custom lower most intrinsics.
6115 case Intrinsic::aarch64_prefetch: {
6116 SDValue Chain = Op.getOperand(0);
6117 SDValue Addr = Op.getOperand(2);
6118
6119 unsigned IsWrite = Op.getConstantOperandVal(3);
6120 unsigned Locality = Op.getConstantOperandVal(4);
6121 unsigned IsStream = Op.getConstantOperandVal(5);
6122 unsigned IsData = Op.getConstantOperandVal(6);
6123 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6124 (!IsData << 3) | // IsDataCache bit
6125 (Locality << 1) | // Cache level bits
6126 (unsigned)IsStream; // Stream bit
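// For example, a write prefetch of the data cache with Locality 0 and no
// streaming hint (IsWrite = 1, IsData = 1, Locality = 0, IsStream = 0) packs
// to PrfOp = (1 << 4) | (0 << 3) | (0 << 1) | 0 = 0b10000 = 16.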
6127
6128 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6129 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6130 }
6131 case Intrinsic::aarch64_sme_str:
6132 case Intrinsic::aarch64_sme_ldr: {
6133 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6134 }
6135 case Intrinsic::aarch64_sme_za_enable:
6136 return DAG.getNode(
6137 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6138 Op->getOperand(0), // Chain
6139 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6140 case Intrinsic::aarch64_sme_za_disable:
6141 return DAG.getNode(
6142 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6143 Op->getOperand(0), // Chain
6144 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6145 }
6146}
6147
6148SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6149 SelectionDAG &DAG) const {
6150 unsigned IntNo = Op.getConstantOperandVal(1);
6151 SDLoc DL(Op);
6152 switch (IntNo) {
6153 default:
6154 return SDValue(); // Don't custom lower most intrinsics.
6155 case Intrinsic::aarch64_mops_memset_tag: {
6156 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6157 SDValue Chain = Node->getChain();
6158 SDValue Dst = Op.getOperand(2);
6159 SDValue Val = Op.getOperand(3);
6160 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6161 SDValue Size = Op.getOperand(4);
6162 auto Alignment = Node->getMemOperand()->getAlign();
6163 bool IsVol = Node->isVolatile();
6164 auto DstPtrInfo = Node->getPointerInfo();
6165
6166 const auto &SDI =
6167 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6168 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6169 Chain, Dst, Val, Size, Alignment, IsVol,
6170 DstPtrInfo, MachinePointerInfo{});
6171
6172 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6173 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6174 // LowerOperationWrapper will complain that the number of results has
6175 // changed.
6176 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6177 }
6178 }
6179}
6180
6181SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6182 SelectionDAG &DAG) const {
6183 unsigned IntNo = Op.getConstantOperandVal(0);
6184 SDLoc DL(Op);
6185 switch (IntNo) {
6186 default: return SDValue(); // Don't custom lower most intrinsics.
6187 case Intrinsic::thread_pointer: {
6188 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6189 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6190 }
6191 case Intrinsic::aarch64_sve_whilewr_b:
6192 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6193 Op.getOperand(1), Op.getOperand(2),
6194 DAG.getConstant(1, DL, MVT::i64));
6195 case Intrinsic::aarch64_sve_whilewr_h:
6196 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6197 Op.getOperand(1), Op.getOperand(2),
6198 DAG.getConstant(2, DL, MVT::i64));
6199 case Intrinsic::aarch64_sve_whilewr_s:
6200 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6201 Op.getOperand(1), Op.getOperand(2),
6202 DAG.getConstant(4, DL, MVT::i64));
6203 case Intrinsic::aarch64_sve_whilewr_d:
6204 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6205 Op.getOperand(1), Op.getOperand(2),
6206 DAG.getConstant(8, DL, MVT::i64));
6207 case Intrinsic::aarch64_sve_whilerw_b:
6208 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6209 Op.getOperand(1), Op.getOperand(2),
6210 DAG.getConstant(1, DL, MVT::i64));
6211 case Intrinsic::aarch64_sve_whilerw_h:
6212 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6213 Op.getOperand(1), Op.getOperand(2),
6214 DAG.getConstant(2, DL, MVT::i64));
6215 case Intrinsic::aarch64_sve_whilerw_s:
6216 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6217 Op.getOperand(1), Op.getOperand(2),
6218 DAG.getConstant(4, DL, MVT::i64));
6219 case Intrinsic::aarch64_sve_whilerw_d:
6220 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6221 Op.getOperand(1), Op.getOperand(2),
6222 DAG.getConstant(8, DL, MVT::i64));
6223 case Intrinsic::aarch64_neon_abs: {
6224 EVT Ty = Op.getValueType();
6225 if (Ty == MVT::i64) {
6226 SDValue Result =
6227 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6228 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6229 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6230 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6231 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6232 } else {
6233 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6234 }
6235 }
6236 case Intrinsic::aarch64_neon_pmull64: {
6237 SDValue LHS = Op.getOperand(1);
6238 SDValue RHS = Op.getOperand(2);
6239
6240 std::optional<uint64_t> LHSLane =
6241 getConstantLaneNumOfExtractHalfOperand(LHS);
6242 std::optional<uint64_t> RHSLane =
6243 getConstantLaneNumOfExtractHalfOperand(RHS);
6244
6245 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6246 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6247
6248 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
6249 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6250 // which ISel recognizes better. For example, generate a ldr into d*
6251 // registers as opposed to a GPR load followed by a fmov.
6252 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6253 std::optional<uint64_t> OtherLane,
6254 const SDLoc &DL,
6255 SelectionDAG &DAG) -> SDValue {
6256 // If the operand is a higher half itself, rewrite it to
6257 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6258 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6259 if (NLane == 1)
6260 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6261 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6262
6263 // Operand N is not a higher half but the other operand is.
6264 if (OtherLane == 1) {
6265 // If this operand is a lower half, rewrite it to
6266 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6267 // align lanes of two operands. A roundtrip sequence (to move from lane
6268 // 1 to lane 0) is like this:
6269 // mov x8, v0.d[1]
6270 // fmov d0, x8
6271 if (NLane == 0)
6272 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6273 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6274 N.getOperand(0),
6275 DAG.getConstant(0, DL, MVT::i64)),
6276 DAG.getConstant(1, DL, MVT::i64));
6277
6278 // Otherwise just dup from main to all lanes.
6279 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6280 }
6281
6282 // Neither operand is an extract of higher half, so codegen may just use
6283 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6284 assert(N.getValueType() == MVT::i64 &&
6285 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6286 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6287 };
6288
6289 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6290 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6291
6292 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6293 }
6294 case Intrinsic::aarch64_neon_smax:
6295 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6296 Op.getOperand(2));
6297 case Intrinsic::aarch64_neon_umax:
6298 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6299 Op.getOperand(2));
6300 case Intrinsic::aarch64_neon_smin:
6301 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6302 Op.getOperand(2));
6303 case Intrinsic::aarch64_neon_umin:
6304 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6305 Op.getOperand(2));
6306 case Intrinsic::aarch64_neon_scalar_sqxtn:
6307 case Intrinsic::aarch64_neon_scalar_sqxtun:
6308 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6309 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6310 if (Op.getValueType() == MVT::i32)
6311 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6312 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6313 Op.getOperand(0),
6314 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6315 Op.getOperand(1))));
6316 return SDValue();
6317 }
6318 case Intrinsic::aarch64_neon_sqxtn:
6319 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6320 Op.getOperand(1));
6321 case Intrinsic::aarch64_neon_sqxtun:
6322 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6323 Op.getOperand(1));
6324 case Intrinsic::aarch64_neon_uqxtn:
6325 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6326 Op.getOperand(1));
6327 case Intrinsic::aarch64_neon_sqshrn:
6328 if (Op.getValueType().isVector())
6329 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6330 DAG.getNode(AArch64ISD::VASHR, DL,
6331 Op.getOperand(1).getValueType(),
6332 Op.getOperand(1), Op.getOperand(2)));
6333 return SDValue();
6334 case Intrinsic::aarch64_neon_sqshrun:
6335 if (Op.getValueType().isVector())
6336 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6337 DAG.getNode(AArch64ISD::VASHR, DL,
6338 Op.getOperand(1).getValueType(),
6339 Op.getOperand(1), Op.getOperand(2)));
6340 return SDValue();
6341 case Intrinsic::aarch64_neon_uqshrn:
6342 if (Op.getValueType().isVector())
6343 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6344 DAG.getNode(AArch64ISD::VLSHR, DL,
6345 Op.getOperand(1).getValueType(),
6346 Op.getOperand(1), Op.getOperand(2)));
6347 return SDValue();
6348 case Intrinsic::aarch64_neon_sqrshrn:
6349 if (Op.getValueType().isVector())
6350 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6351 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6352 Op.getOperand(1).getValueType(),
6353 Op.getOperand(1), Op.getOperand(2)));
6354 return SDValue();
6355 case Intrinsic::aarch64_neon_sqrshrun:
6356 if (Op.getValueType().isVector())
6357 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6358 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6359 Op.getOperand(1).getValueType(),
6360 Op.getOperand(1), Op.getOperand(2)));
6361 return SDValue();
6362 case Intrinsic::aarch64_neon_uqrshrn:
6363 if (Op.getValueType().isVector())
6364 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6365 DAG.getNode(AArch64ISD::URSHR_I, DL,
6366 Op.getOperand(1).getValueType(),
6367 Op.getOperand(1), Op.getOperand(2)));
6368 return SDValue();
6369 case Intrinsic::aarch64_neon_sqadd:
6370 if (Op.getValueType().isVector())
6371 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6372 Op.getOperand(2));
6373 return SDValue();
6374 case Intrinsic::aarch64_neon_sqsub:
6375 if (Op.getValueType().isVector())
6376 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6377 Op.getOperand(2));
6378 return SDValue();
6379 case Intrinsic::aarch64_neon_uqadd:
6380 if (Op.getValueType().isVector())
6381 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6382 Op.getOperand(2));
6383 return SDValue();
6384 case Intrinsic::aarch64_neon_uqsub:
6385 if (Op.getValueType().isVector())
6386 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6387 Op.getOperand(2));
6388 return SDValue();
6389 case Intrinsic::aarch64_sve_whilelt:
6390 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6391 /*IsEqual=*/false);
6392 case Intrinsic::aarch64_sve_whilels:
6393 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6394 /*IsEqual=*/true);
6395 case Intrinsic::aarch64_sve_whilele:
6396 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6397 /*IsEqual=*/true);
6398 case Intrinsic::aarch64_sve_sunpkhi:
6399 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6400 Op.getOperand(1));
6401 case Intrinsic::aarch64_sve_sunpklo:
6402 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6403 Op.getOperand(1));
6404 case Intrinsic::aarch64_sve_uunpkhi:
6405 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6406 Op.getOperand(1));
6407 case Intrinsic::aarch64_sve_uunpklo:
6408 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6409 Op.getOperand(1));
6410 case Intrinsic::aarch64_sve_clasta_n:
6411 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6412 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6413 case Intrinsic::aarch64_sve_clastb_n:
6414 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6415 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6416 case Intrinsic::aarch64_sve_lasta:
6417 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6418 Op.getOperand(1), Op.getOperand(2));
6419 case Intrinsic::aarch64_sve_lastb:
6420 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6421 Op.getOperand(1), Op.getOperand(2));
6422 case Intrinsic::aarch64_sve_rev:
6423 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6424 Op.getOperand(1));
6425 case Intrinsic::aarch64_sve_tbl:
6426 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6427 Op.getOperand(2));
6428 case Intrinsic::aarch64_sve_trn1:
6429 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6430 Op.getOperand(1), Op.getOperand(2));
6431 case Intrinsic::aarch64_sve_trn2:
6432 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6433 Op.getOperand(1), Op.getOperand(2));
6434 case Intrinsic::aarch64_sve_uzp1:
6435 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6436 Op.getOperand(1), Op.getOperand(2));
6437 case Intrinsic::aarch64_sve_uzp2:
6438 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6439 Op.getOperand(1), Op.getOperand(2));
6440 case Intrinsic::aarch64_sve_zip1:
6441 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6442 Op.getOperand(1), Op.getOperand(2));
6443 case Intrinsic::aarch64_sve_zip2:
6444 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6445 Op.getOperand(1), Op.getOperand(2));
6446 case Intrinsic::aarch64_sve_splice:
6447 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6448 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6449 case Intrinsic::aarch64_sve_ptrue:
6450 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6451 case Intrinsic::aarch64_sve_clz:
6452 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6453 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6454 case Intrinsic::aarch64_sme_cntsd: {
6455 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6456 DAG.getConstant(1, DL, MVT::i32));
6457 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6458 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6459 }
6460 case Intrinsic::aarch64_sve_cnt: {
6461 SDValue Data = Op.getOperand(3);
6462 // CTPOP only supports integer operands.
6463 if (Data.getValueType().isFloatingPoint())
6464 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6465 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6466 Op.getOperand(2), Data, Op.getOperand(1));
6467 }
6468 case Intrinsic::aarch64_sve_dupq_lane:
6469 return LowerDUPQLane(Op, DAG);
6470 case Intrinsic::aarch64_sve_convert_from_svbool:
6471 if (Op.getValueType() == MVT::aarch64svcount)
6472 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6473 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6474 case Intrinsic::aarch64_sve_convert_to_svbool:
6475 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6476 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6477 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6478 case Intrinsic::aarch64_sve_fneg:
6479 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6480 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6481 case Intrinsic::aarch64_sve_frintp:
6482 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6483 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6484 case Intrinsic::aarch64_sve_frintm:
6485 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6486 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6487 case Intrinsic::aarch64_sve_frinti:
6488 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6489 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6490 Op.getOperand(1));
6491 case Intrinsic::aarch64_sve_frintx:
6492 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6493 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6494 case Intrinsic::aarch64_sve_frinta:
6495 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6496 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6497 case Intrinsic::aarch64_sve_frintn:
6498 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6499 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6500 Op.getOperand(1));
6501 case Intrinsic::aarch64_sve_frintz:
6502 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6503 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6504 case Intrinsic::aarch64_sve_ucvtf:
6505 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6506 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6507 Op.getOperand(1));
6508 case Intrinsic::aarch64_sve_scvtf:
6509 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6510 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6511 Op.getOperand(1));
6512 case Intrinsic::aarch64_sve_fcvtzu:
6513 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6514 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6515 case Intrinsic::aarch64_sve_fcvtzs:
6516 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6517 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6518 case Intrinsic::aarch64_sve_fsqrt:
6519 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6520 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6521 case Intrinsic::aarch64_sve_frecpx:
6522 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6523 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6524 case Intrinsic::aarch64_sve_frecpe_x:
6525 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6526 Op.getOperand(1));
6527 case Intrinsic::aarch64_sve_frecps_x:
6528 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6529 Op.getOperand(1), Op.getOperand(2));
6530 case Intrinsic::aarch64_sve_frsqrte_x:
6531 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6532 Op.getOperand(1));
6533 case Intrinsic::aarch64_sve_frsqrts_x:
6534 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6535 Op.getOperand(1), Op.getOperand(2));
6536 case Intrinsic::aarch64_sve_fabs:
6537 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6538 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_abs:
6540 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6541 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6542 case Intrinsic::aarch64_sve_neg:
6543 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6544 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6545 case Intrinsic::aarch64_sve_insr: {
6546 SDValue Scalar = Op.getOperand(2);
6547 EVT ScalarTy = Scalar.getValueType();
6548 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6549 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6550
6551 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6552 Op.getOperand(1), Scalar);
6553 }
6554 case Intrinsic::aarch64_sve_rbit:
6555 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6556 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6557 Op.getOperand(1));
6558 case Intrinsic::aarch64_sve_revb:
6559 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6560 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6561 case Intrinsic::aarch64_sve_revh:
6562 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6563 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6564 case Intrinsic::aarch64_sve_revw:
6565 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6566 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6567 case Intrinsic::aarch64_sve_revd:
6568 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6569 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6570 case Intrinsic::aarch64_sve_sxtb:
6571 return DAG.getNode(
6572 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6573 Op.getOperand(2), Op.getOperand(3),
6574 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6575 Op.getOperand(1));
6576 case Intrinsic::aarch64_sve_sxth:
6577 return DAG.getNode(
6578 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6579 Op.getOperand(2), Op.getOperand(3),
6580 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6581 Op.getOperand(1));
6582 case Intrinsic::aarch64_sve_sxtw:
6583 return DAG.getNode(
6584 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6585 Op.getOperand(2), Op.getOperand(3),
6586 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6587 Op.getOperand(1));
6588 case Intrinsic::aarch64_sve_uxtb:
6589 return DAG.getNode(
6590 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6591 Op.getOperand(2), Op.getOperand(3),
6592 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6593 Op.getOperand(1));
6594 case Intrinsic::aarch64_sve_uxth:
6595 return DAG.getNode(
6596 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6597 Op.getOperand(2), Op.getOperand(3),
6598 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6599 Op.getOperand(1));
6600 case Intrinsic::aarch64_sve_uxtw:
6601 return DAG.getNode(
6602 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6603 Op.getOperand(2), Op.getOperand(3),
6604 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6605 Op.getOperand(1));
6606 case Intrinsic::localaddress: {
6607 const auto &MF = DAG.getMachineFunction();
6608 const auto *RegInfo = Subtarget->getRegisterInfo();
6609 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6610 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6611 Op.getSimpleValueType());
6612 }
6613
6614 case Intrinsic::eh_recoverfp: {
6615 // FIXME: This needs to be implemented to correctly handle highly aligned
6616 // stack objects. For now we simply return the incoming FP. Refer D53541
6617 // for more details.
6618 SDValue FnOp = Op.getOperand(1);
6619 SDValue IncomingFPOp = Op.getOperand(2);
6620 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6621 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6622 if (!Fn)
6623 report_fatal_error(
6624 "llvm.eh.recoverfp must take a function as the first argument");
6625 return IncomingFPOp;
6626 }
6627
6628 case Intrinsic::aarch64_neon_vsri:
6629 case Intrinsic::aarch64_neon_vsli:
6630 case Intrinsic::aarch64_sve_sri:
6631 case Intrinsic::aarch64_sve_sli: {
6632 EVT Ty = Op.getValueType();
6633
6634 if (!Ty.isVector())
6635 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6636
6637 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6638
6639 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6640 IntNo == Intrinsic::aarch64_sve_sri;
6641 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6642 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6643 Op.getOperand(3));
6644 }
6645
6646 case Intrinsic::aarch64_neon_srhadd:
6647 case Intrinsic::aarch64_neon_urhadd:
6648 case Intrinsic::aarch64_neon_shadd:
6649 case Intrinsic::aarch64_neon_uhadd: {
6650 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6651 IntNo == Intrinsic::aarch64_neon_shadd);
6652 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6653 IntNo == Intrinsic::aarch64_neon_urhadd);
6654 unsigned Opcode = IsSignedAdd
6655 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6656 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6657 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6658 Op.getOperand(2));
6659 }
6660 case Intrinsic::aarch64_neon_saddlp:
6661 case Intrinsic::aarch64_neon_uaddlp: {
6662 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6663 ? AArch64ISD::UADDLP
6664 : AArch64ISD::SADDLP;
6665 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6666 }
6667 case Intrinsic::aarch64_neon_sdot:
6668 case Intrinsic::aarch64_neon_udot:
6669 case Intrinsic::aarch64_sve_sdot:
6670 case Intrinsic::aarch64_sve_udot: {
6671 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6672 IntNo == Intrinsic::aarch64_sve_udot)
6673 ? AArch64ISD::UDOT
6674 : AArch64ISD::SDOT;
6675 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6676 Op.getOperand(2), Op.getOperand(3));
6677 }
6678 case Intrinsic::aarch64_neon_usdot:
6679 case Intrinsic::aarch64_sve_usdot: {
6680 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6681 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6682 }
6683 case Intrinsic::aarch64_neon_saddlv:
6684 case Intrinsic::aarch64_neon_uaddlv: {
6685 EVT OpVT = Op.getOperand(1).getValueType();
6686 EVT ResVT = Op.getValueType();
6687 assert(
6688 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6689 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6690 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6691 "Unexpected aarch64_neon_u/saddlv type");
6692 (void)OpVT;
6693 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6694 SDValue ADDLV = DAG.getNode(
6695 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6696 : AArch64ISD::SADDLV,
6697 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6698 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6699 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6700 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6701 return EXTRACT_VEC_ELT;
6702 }
6703 case Intrinsic::experimental_cttz_elts: {
6704 SDValue CttzOp = Op.getOperand(1);
6705 EVT VT = CttzOp.getValueType();
6706 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6707
6708 if (VT.isFixedLengthVector()) {
6709 // We can use SVE instructions to lower this intrinsic by first creating
6710 // an SVE predicate register mask from the fixed-width vector.
6711 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6712 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6713 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6714 }
6715
6716 SDValue NewCttzElts =
6717 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6718 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6719 }
6720 case Intrinsic::experimental_vector_match: {
6721 return LowerVectorMatch(Op, DAG);
6722 }
6723 }
6724}
6725
6726bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6727 if (VT.getVectorElementType() == MVT::i8 ||
6728 VT.getVectorElementType() == MVT::i16) {
6729 EltTy = MVT::i32;
6730 return true;
6731 }
6732 return false;
6733}
6734
6735bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6736 EVT DataVT) const {
6737 const EVT IndexVT = Extend.getOperand(0).getValueType();
6738 // SVE only supports implicit extension of 32-bit indices.
6739 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6740 return false;
6741
6742 // Indices cannot be smaller than the main data type.
6743 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6744 return false;
6745
6746 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6747 // element container type, which would violate the previous clause.
6748 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6749}
6750
6751bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6752 EVT ExtVT = ExtVal.getValueType();
6753 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6754 return false;
6755
6756 // It may be worth creating extending masked loads if there are multiple
6757 // masked loads using the same predicate. That way we'll end up creating
6758 // extending masked loads that may then get split by the legaliser. This
6759 // results in just one set of predicate unpacks at the start, instead of
6760 // multiple sets of vector unpacks after each load.
6761 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6762 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6763 // Disable extending masked loads for fixed-width for now, since the code
6764 // quality doesn't look great.
6765 if (!ExtVT.isScalableVector())
6766 return false;
6767
6768 unsigned NumExtMaskedLoads = 0;
6769 for (auto *U : Ld->getMask()->users())
6770 if (isa<MaskedLoadSDNode>(U))
6771 NumExtMaskedLoads++;
6772
6773 if (NumExtMaskedLoads <= 1)
6774 return false;
6775 }
6776 }
6777
6778 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6779 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6780 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6781}
6782
6783unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6784 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6785 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6786 AArch64ISD::GLD1_MERGE_ZERO},
6787 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6788 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6789 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6790 AArch64ISD::GLD1_MERGE_ZERO},
6791 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6792 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6793 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6794 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6795 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6796 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6797 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6798 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6799 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6800 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6801 };
6802 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6803 return AddrModes.find(Key)->second;
6804}
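The table above keys on (IsScaled, IsSigned, NeedsExtend); note that the signedness bit only changes the result when the index actually needs extending (SXTW vs. UXTW forms). A minimal illustrative check, assuming it sat in this translation unit next to the function above (the helper name exerciseGatherOpcodeTable is hypothetical, not part of the upstream source):

// Illustrative only: exercises the mapping defined by the table above.
static void exerciseGatherOpcodeTable() {
  assert(getGatherVecOpcode(false, false, false) ==
         AArch64ISD::GLD1_MERGE_ZERO);
  // Signedness is irrelevant when no index extension is needed.
  assert(getGatherVecOpcode(false, true, false) ==
         AArch64ISD::GLD1_MERGE_ZERO);
  assert(getGatherVecOpcode(true, true, true) ==
         AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO);
}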
6805
6806unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6807 switch (Opcode) {
6808 default:
6809 llvm_unreachable("unimplemented opcode");
6810 return Opcode;
6811 case AArch64ISD::GLD1_MERGE_ZERO:
6812 return AArch64ISD::GLD1S_MERGE_ZERO;
6813 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6814 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6815 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6816 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6817 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6818 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6819 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6820 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6821 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6822 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6823 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6824 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6825 }
6826}
6827
6828SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6829 SelectionDAG &DAG) const {
6830 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6831
6832 SDLoc DL(Op);
6833 SDValue Chain = MGT->getChain();
6834 SDValue PassThru = MGT->getPassThru();
6835 SDValue Mask = MGT->getMask();
6836 SDValue BasePtr = MGT->getBasePtr();
6837 SDValue Index = MGT->getIndex();
6838 SDValue Scale = MGT->getScale();
6839 EVT VT = Op.getValueType();
6840 EVT MemVT = MGT->getMemoryVT();
6841 ISD::LoadExtType ExtType = MGT->getExtensionType();
6842 ISD::MemIndexType IndexType = MGT->getIndexType();
6843
6844  // SVE supports zero (and so undef) passthrough values only; everything else
6845  // must be handled manually by an explicit select on the load's output.
6846 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6847 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6848 SDValue Load =
6849 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6850 MGT->getMemOperand(), IndexType, ExtType);
6851 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6852 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6853 }
6854
6855 bool IsScaled = MGT->isIndexScaled();
6856 bool IsSigned = MGT->isIndexSigned();
6857
6858  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6859  // must be calculated beforehand.
6860 uint64_t ScaleVal = Scale->getAsZExtVal();
6861 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6862 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6863 EVT IndexVT = Index.getValueType();
6864 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6865 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6866 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6867
6868 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6869 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6870 MGT->getMemOperand(), IndexType, ExtType);
6871 }
6872
6873 // Lower fixed length gather to a scalable equivalent.
6874 if (VT.isFixedLengthVector()) {
6875 assert(Subtarget->useSVEForFixedLengthVectors() &&
6876 "Cannot lower when not using SVE for fixed vectors!");
6877
6878 // NOTE: Handle floating-point as if integer then bitcast the result.
6879 EVT DataVT = VT.changeVectorElementTypeToInteger();
6880 MemVT = MemVT.changeVectorElementTypeToInteger();
6881
6882 // Find the smallest integer fixed length vector we can use for the gather.
6883 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6884 if (DataVT.getVectorElementType() == MVT::i64 ||
6885 Index.getValueType().getVectorElementType() == MVT::i64 ||
6886 Mask.getValueType().getVectorElementType() == MVT::i64)
6887 PromotedVT = VT.changeVectorElementType(MVT::i64);
6888
6889 // Promote vector operands except for passthrough, which we know is either
6890 // undef or zero, and thus best constructed directly.
6891 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6892 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6893 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6894
6895 // A promoted result type forces the need for an extending load.
6896 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6897 ExtType = ISD::EXTLOAD;
6898
6899 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6900
6901 // Convert fixed length vector operands to scalable.
6902 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6903 Index = convertToScalableVector(DAG, ContainerVT, Index);
6904    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6905    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6906 : DAG.getConstant(0, DL, ContainerVT);
6907
6908 // Emit equivalent scalable vector gather.
6909 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6910 SDValue Load =
6911 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6912 Ops, MGT->getMemOperand(), IndexType, ExtType);
6913
6914 // Extract fixed length data then convert to the required result type.
6915 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6916 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6917 if (VT.isFloatingPoint())
6918 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6919
6920 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6921 }
6922
6923 // Everything else is legal.
6924 return Op;
6925}
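The scale handling above relies on the scale being a power of two, so Index * Scale can be folded into a left shift and the gather then runs with a unit scale. A standalone sketch of just that arithmetic (foldScaleIntoIndex is a hypothetical name, not part of this file):

#include <cassert>
#include <cstdint>

// Sketch of the scale folding used by LowerMGATHER/LowerMSCATTER above: when
// the node's scale does not match sizeof(memory element), it is folded into
// the index as a left shift, after which the node is rebuilt with scale = 1.
static uint64_t foldScaleIntoIndex(uint64_t Index, uint64_t ScaleVal) {
  assert(ScaleVal && (ScaleVal & (ScaleVal - 1)) == 0 && "power of two");
  unsigned ShiftAmt = 0;
  while ((1ull << ShiftAmt) != ScaleVal)
    ++ShiftAmt;               // equivalent to Log2_32(ScaleVal) above
  return Index << ShiftAmt;   // Index * ScaleVal
}
// e.g. foldScaleIntoIndex(5, 8) == 40, matching BasePtr + Index * Scale.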
6926
6927SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6928 SelectionDAG &DAG) const {
6929 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6930
6931 SDLoc DL(Op);
6932 SDValue Chain = MSC->getChain();
6933 SDValue StoreVal = MSC->getValue();
6934 SDValue Mask = MSC->getMask();
6935 SDValue BasePtr = MSC->getBasePtr();
6936 SDValue Index = MSC->getIndex();
6937 SDValue Scale = MSC->getScale();
6938 EVT VT = StoreVal.getValueType();
6939 EVT MemVT = MSC->getMemoryVT();
6940 ISD::MemIndexType IndexType = MSC->getIndexType();
6941 bool Truncating = MSC->isTruncatingStore();
6942
6943 bool IsScaled = MSC->isIndexScaled();
6944 bool IsSigned = MSC->isIndexSigned();
6945
6946  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6947  // must be calculated beforehand.
6948 uint64_t ScaleVal = Scale->getAsZExtVal();
6949 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6950 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6951 EVT IndexVT = Index.getValueType();
6952 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6953 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6954 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6955
6956 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6957 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6958 MSC->getMemOperand(), IndexType, Truncating);
6959 }
6960
6961 // Lower fixed length scatter to a scalable equivalent.
6962 if (VT.isFixedLengthVector()) {
6963 assert(Subtarget->useSVEForFixedLengthVectors() &&
6964 "Cannot lower when not using SVE for fixed vectors!");
6965
6966 // Once bitcast we treat floating-point scatters as if integer.
6967 if (VT.isFloatingPoint()) {
6968      VT = VT.changeVectorElementTypeToInteger();
6969      MemVT = MemVT.changeVectorElementTypeToInteger();
6970 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6971 }
6972
6973 // Find the smallest integer fixed length vector we can use for the scatter.
6974 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6975 if (VT.getVectorElementType() == MVT::i64 ||
6976 Index.getValueType().getVectorElementType() == MVT::i64 ||
6977 Mask.getValueType().getVectorElementType() == MVT::i64)
6978 PromotedVT = VT.changeVectorElementType(MVT::i64);
6979
6980 // Promote vector operands.
6981 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6982 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6983 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6984 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6985
6986 // A promoted value type forces the need for a truncating store.
6987 if (PromotedVT != VT)
6988 Truncating = true;
6989
6990 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6991
6992 // Convert fixed length vector operands to scalable.
6993 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6994 Index = convertToScalableVector(DAG, ContainerVT, Index);
6995    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6996    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6997
6998 // Emit equivalent scalable vector scatter.
6999 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7000 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7001 MSC->getMemOperand(), IndexType, Truncating);
7002 }
7003
7004 // Everything else is legal.
7005 return Op;
7006}
7007
7008SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7009 SDLoc DL(Op);
7010 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7011 assert(LoadNode && "Expected custom lowering of a masked load node");
7012 EVT VT = Op->getValueType(0);
7013
7014 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7015 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7016
7017 SDValue PassThru = LoadNode->getPassThru();
7018 SDValue Mask = LoadNode->getMask();
7019
7020 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7021 return Op;
7022
7023  SDValue Load = DAG.getMaskedLoad(
7024      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7025 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7026 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7027 LoadNode->getExtensionType());
7028
7029 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7030
7031 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7032}
7033
7034// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7035 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7036                                         EVT VT, EVT MemVT,
7037 SelectionDAG &DAG) {
7038 assert(VT.isVector() && "VT should be a vector type");
7039 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7040
7041 SDValue Value = ST->getValue();
7042
7043  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7044  // extracts the word lane which represents the v4i8 subvector. This optimizes
7045  // the store to:
7046 //
7047 // xtn v0.8b, v0.8h
7048 // str s0, [x0]
7049
7050 SDValue Undef = DAG.getUNDEF(MVT::i16);
7051 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7052 {Undef, Undef, Undef, Undef});
7053
7054 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7055 Value, UndefVec);
7056 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7057
7058 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7059 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7060 Trunc, DAG.getConstant(0, DL, MVT::i64));
7061
7062 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7063 ST->getBasePtr(), ST->getMemOperand());
7064}
7065
7066 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7067   SDLoc DL(Op);
7068 SDValue Src = Op.getOperand(0);
7069 MVT DestVT = Op.getSimpleValueType();
7070 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7071   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
7072
7073 unsigned SrcAS = N->getSrcAddressSpace();
7074 unsigned DestAS = N->getDestAddressSpace();
7075 assert(SrcAS != DestAS &&
7076 "addrspacecast must be between different address spaces");
7077 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7078 TLI.getTargetMachine().getPointerSize(DestAS) &&
7079 "addrspacecast must be between different ptr sizes");
7080 (void)TLI;
7081
7082 if (SrcAS == ARM64AS::PTR32_SPTR) {
7083 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7084 DAG.getTargetConstant(0, DL, DestVT));
7085 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7086 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7087 DAG.getTargetConstant(0, DL, DestVT));
7088 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7089 (DestAS == ARM64AS::PTR32_UPTR)) {
7090 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7091 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7092 return Trunc;
7093 } else {
7094 return Src;
7095 }
7096}
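An illustrative scalar view of the PTR32 address-space casts handled above (assuming PTR32_SPTR is the signed and PTR32_UPTR the unsigned 32-bit pointer space): widening sign- or zero-extends, and narrowing keeps the low 32 bits:

#include <cstdint>

// Illustrative only, not upstream code.
static uint64_t sextPtr32(uint32_t P) { return (uint64_t)(int64_t)(int32_t)P; }
static uint64_t zextPtr32(uint32_t P) { return (uint64_t)P; }
static uint32_t truncPtr64(uint64_t P) { return (uint32_t)P; }
// e.g. sextPtr32(0x80000000u) == 0xFFFFFFFF80000000, zextPtr32(0x80000000u)
// == 0x0000000080000000, truncPtr64(0x123456789ABCull) == 0x56789ABC.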
7097
7098 // Custom lowering for any store, vector or scalar, and/or default or with a
7099 // truncate operation. Currently we only custom lower truncating stores from
7100 // vector v4i16 to v4i8 and volatile stores of i128.
7101SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7102 SelectionDAG &DAG) const {
7103 SDLoc Dl(Op);
7104 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7105 assert (StoreNode && "Can only custom lower store nodes");
7106
7107 SDValue Value = StoreNode->getValue();
7108
7109 EVT VT = Value.getValueType();
7110 EVT MemVT = StoreNode->getMemoryVT();
7111
7112 if (VT.isVector()) {
7113     if (useSVEForFixedLengthVectorVT(
7114             VT,
7115 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7116 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7117
7118 unsigned AS = StoreNode->getAddressSpace();
7119 Align Alignment = StoreNode->getAlign();
7120 if (Alignment < MemVT.getStoreSize() &&
7121 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7122 StoreNode->getMemOperand()->getFlags(),
7123 nullptr)) {
7124 return scalarizeVectorStore(StoreNode, DAG);
7125 }
7126
7127 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7128 MemVT == MVT::v4i8) {
7129 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7130 }
7131 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7132 // the custom lowering, as there are no un-paired non-temporal stores and
7133 // legalization will break up 256 bit inputs.
7134 ElementCount EC = MemVT.getVectorElementCount();
7135 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7136 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7137 (MemVT.getScalarSizeInBits() == 8u ||
7138 MemVT.getScalarSizeInBits() == 16u ||
7139 MemVT.getScalarSizeInBits() == 32u ||
7140 MemVT.getScalarSizeInBits() == 64u)) {
7141 SDValue Lo =
7142           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7143                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7144                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7145 SDValue Hi =
7146           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7147                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7148                       StoreNode->getValue(),
7149 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7150       SDValue Result = DAG.getMemIntrinsicNode(
7151           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7152 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7153 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7154 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7155 return Result;
7156 }
7157 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7158 return LowerStore128(Op, DAG);
7159 } else if (MemVT == MVT::i64x8) {
7160 SDValue Value = StoreNode->getValue();
7161 assert(Value->getValueType(0) == MVT::i64x8);
7162 SDValue Chain = StoreNode->getChain();
7163 SDValue Base = StoreNode->getBasePtr();
7164 EVT PtrVT = Base.getValueType();
7165 for (unsigned i = 0; i < 8; i++) {
7166 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7167 Value, DAG.getConstant(i, Dl, MVT::i32));
7168 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7169 DAG.getConstant(i * 8, Dl, PtrVT));
7170 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7171 StoreNode->getBaseAlign());
7172 }
7173 return Chain;
7174 }
7175
7176 return SDValue();
7177}
7178
7179/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7180SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7181 SelectionDAG &DAG) const {
7182 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7183 assert(StoreNode->getMemoryVT() == MVT::i128);
7184 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7185
7186 bool IsStoreRelease =
7187       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7188   if (StoreNode->isAtomic())
7189 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7190 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7191            StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7192            StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7193
7194 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7195 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7196 ? StoreNode->getOperand(1)
7197 : StoreNode->getOperand(2);
7198 SDLoc DL(Op);
7199 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7200 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7201 if (DAG.getDataLayout().isBigEndian())
7202 std::swap(StoreValue.first, StoreValue.second);
7203   SDValue Result = DAG.getMemIntrinsicNode(
7204       Opcode, DL, DAG.getVTList(MVT::Other),
7205 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7206 StoreNode->getBasePtr()},
7207 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7208 return Result;
7209}
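A standalone sketch of the value split performed by LowerStore128 above (hypothetical helper; uses the Clang/GCC __int128 extension): the 128-bit value becomes a lo/hi pair of 64-bit halves, swapped on big-endian before feeding STP/STILP:

#include <cstdint>
#include <utility>

// Illustrative only: mirrors DAG.SplitScalar plus the big-endian swap above.
static std::pair<uint64_t, uint64_t>
splitI128ForStp(unsigned __int128 V, bool IsBigEndian) {
  uint64_t Lo = static_cast<uint64_t>(V);
  uint64_t Hi = static_cast<uint64_t>(V >> 64);
  std::pair<uint64_t, uint64_t> Halves{Lo, Hi};
  if (IsBigEndian)
    std::swap(Halves.first, Halves.second); // same swap as the lowering
  return Halves;
}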
7210
7211SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7212 SelectionDAG &DAG) const {
7213 SDLoc DL(Op);
7214 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7215 assert(LoadNode && "Expected custom lowering of a load node");
7216
7217 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7218     SmallVector<SDValue, 8> Ops;
7219     SDValue Base = LoadNode->getBasePtr();
7220 SDValue Chain = LoadNode->getChain();
7221 EVT PtrVT = Base.getValueType();
7222 for (unsigned i = 0; i < 8; i++) {
7223 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7224 DAG.getConstant(i * 8, DL, PtrVT));
7225 SDValue Part =
7226 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7227 LoadNode->getBaseAlign());
7228 Ops.push_back(Part);
7229 Chain = SDValue(Part.getNode(), 1);
7230 }
7231 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7232 return DAG.getMergeValues({Loaded, Chain}, DL);
7233 }
7234
7235 // Custom lowering for extending v4i8 vector loads.
7236 EVT VT = Op->getValueType(0);
7237 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7238
7239 if (LoadNode->getMemoryVT() != MVT::v4i8)
7240 return SDValue();
7241
7242 // Avoid generating unaligned loads.
7243 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7244 return SDValue();
7245
7246 unsigned ExtType;
7247 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7248 ExtType = ISD::SIGN_EXTEND;
7249 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7250 LoadNode->getExtensionType() == ISD::EXTLOAD)
7251 ExtType = ISD::ZERO_EXTEND;
7252 else
7253 return SDValue();
7254
7255 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7256 LoadNode->getBasePtr(), MachinePointerInfo());
7257 SDValue Chain = Load.getValue(1);
7258 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7259 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7260 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7261 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7262 DAG.getConstant(0, DL, MVT::i64));
7263 if (VT == MVT::v4i32)
7264 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7265 return DAG.getMergeValues({Ext, Chain}, DL);
7266}
7267
7268SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7269 SelectionDAG &DAG) const {
7270 SDLoc DL(Op);
7271 SDValue Vec = Op.getOperand(0);
7272 SDValue Mask = Op.getOperand(1);
7273 SDValue Passthru = Op.getOperand(2);
7274 EVT VecVT = Vec.getValueType();
7275 EVT MaskVT = Mask.getValueType();
7276 EVT ElmtVT = VecVT.getVectorElementType();
7277 const bool IsFixedLength = VecVT.isFixedLengthVector();
7278 const bool HasPassthru = !Passthru.isUndef();
7279 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7280 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7281
7282 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7283
7284 if (!Subtarget->isSVEAvailable())
7285 return SDValue();
7286
7287 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7288 return SDValue();
7289
7290 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7291 if (MinElmts != 2 && MinElmts != 4)
7292 return SDValue();
7293
7294 // We can use the SVE register containing the NEON vector in its lowest bits.
7295 if (IsFixedLength) {
7296 EVT ScalableVecVT =
7297 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7298 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7299 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7300
7301 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7302 DAG.getUNDEF(ScalableVecVT), Vec,
7303 DAG.getConstant(0, DL, MVT::i64));
7304 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7305 DAG.getUNDEF(ScalableMaskVT), Mask,
7306 DAG.getConstant(0, DL, MVT::i64));
7307     Mask = DAG.getNode(ISD::TRUNCATE, DL,
7308                        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7309 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7310 DAG.getUNDEF(ScalableVecVT), Passthru,
7311 DAG.getConstant(0, DL, MVT::i64));
7312
7313 VecVT = Vec.getValueType();
7314 MaskVT = Mask.getValueType();
7315 }
7316
7317 // Get legal type for compact instruction
7318 EVT ContainerVT = getSVEContainerType(VecVT);
7319 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7320
7321 // Convert to i32 or i64 for smaller types, as these are the only supported
7322 // sizes for compact.
7323 if (ContainerVT != VecVT) {
7324 Vec = DAG.getBitcast(CastVT, Vec);
7325 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7326 }
7327
7328 SDValue Compressed = DAG.getNode(
7329       ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7330       DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7331
7332 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7333 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7334 SDValue Offset = DAG.getNode(
7335 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7336 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7337
7338 SDValue IndexMask = DAG.getNode(
7339 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7340 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7341 DAG.getConstant(0, DL, MVT::i64), Offset);
7342
7343 Compressed =
7344 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7345 }
7346
7347 // Extracting from a legal SVE type before truncating produces better code.
7348 if (IsFixedLength) {
7349 Compressed = DAG.getNode(
7350         ISD::EXTRACT_SUBVECTOR, DL,
7351         FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7352 Compressed, DAG.getConstant(0, DL, MVT::i64));
7353 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7354 VecVT = FixedVecVT;
7355 }
7356
7357 // If we changed the element type before, we need to convert it back.
7358 if (ContainerVT != VecVT) {
7359 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7360 Compressed = DAG.getBitcast(VecVT, Compressed);
7361 }
7362
7363 return Compressed;
7364}
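A scalar reference model of the compress lowering above (illustrative only): active lanes are packed to the front, as aarch64_sve_compact does, and when a non-zero passthru is present the tail lanes are taken from it, mirroring the cntp/whilelo select:

#include <cstdint>
#include <vector>

// Reference model; assumes Vec, Mask and Passthru all have the same length.
static std::vector<uint64_t> compressRef(const std::vector<uint64_t> &Vec,
                                         const std::vector<bool> &Mask,
                                         const std::vector<uint64_t> &Passthru) {
  std::vector<uint64_t> Out(Vec.size(), 0);
  size_t N = 0;
  for (size_t I = 0; I < Vec.size(); ++I)
    if (Mask[I])
      Out[N++] = Vec[I];     // compact packs active lanes, filling with 0s
  for (size_t I = N; I < Out.size(); ++I)
    Out[I] = Passthru[I];    // passthru wins where whilelo(0, cntp) is false
  return Out;
}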
7365
7366// Generate SUBS and CSEL for integer abs.
7367SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7368 MVT VT = Op.getSimpleValueType();
7369
7370 if (VT.isVector())
7371 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7372
7373 SDLoc DL(Op);
7374 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7375 Op.getOperand(0));
7376 // Generate SUBS & CSEL.
7377 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7378 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7379 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7380 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7381}
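A scalar equivalent of the SUBS + CSEL sequence emitted above (illustrative only): the flags from the subtraction give the PL (non-negative) condition, which selects between the original value and its negation:

#include <cstdint>

// Illustrative only; the cast through unsigned mimics the wrapping SUB.
static int64_t absViaCsel(int64_t X) {
  int64_t Neg = (int64_t)(0ULL - (uint64_t)X); // the SUB 0, x above
  bool Pl = X >= 0;                            // condition PL from SUBS
  return Pl ? X : Neg;                         // CSEL x, neg, pl
}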
7382
7383 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7384   SDValue Chain = Op.getOperand(0);
7385 SDValue Cond = Op.getOperand(1);
7386 SDValue Dest = Op.getOperand(2);
7387
7388   AArch64CC::CondCode CC;
7389   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7390 SDLoc DL(Op);
7391 SDValue CCVal = getCondCode(DAG, CC);
7392 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7393 Cmp);
7394 }
7395
7396 return SDValue();
7397}
7398
7399 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7400 // expanded. FSHL is converted to FSHR before deciding what to do with it.
7401 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7402   SDValue Shifts = Op.getOperand(2);
7403 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7404 // If opcode is FSHL, convert it to FSHR
7405 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7406 SDLoc DL(Op);
7407 MVT VT = Op.getSimpleValueType();
7408 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7409
7410 if (Op.getOpcode() == ISD::FSHL) {
7411 if (NewShiftNo == 0)
7412 return Op.getOperand(0);
7413
7414 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7415 return DAG.getNode(
7416 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7417 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7418 }
7419
7420 if (Op.getOpcode() == ISD::FSHR) {
7421 if (NewShiftNo == 0)
7422 return Op.getOperand(1);
7423
7424 if (ShiftNo->getZExtValue() == NewShiftNo)
7425 return Op;
7426
7427 // Rewrite using the normalised shift amount.
7428 return DAG.getNode(
7429 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7430 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7431 }
7432 }
7433
7434 return SDValue();
7435}
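A worked scalar example of the constant-shift handling above (illustrative, a 32-bit width assumed): shift amounts are reduced modulo the bit width and FSHL by s becomes FSHR by width - s:

#include <cstdint>

// Illustrative only; matches the normalisation and FSHL -> FSHR rewrite above.
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned S) {
  S %= 32;
  return S == 0 ? Lo : (Lo >> S) | (Hi << (32 - S));
}
static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned S) {
  S %= 32;
  return S == 0 ? Hi : fshr32(Hi, Lo, 32 - S); // same rewrite as the lowering
}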
7436
7437 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7438   SDValue X = Op.getOperand(0);
7439 EVT XScalarTy = X.getValueType();
7440 SDValue Exp = Op.getOperand(1);
7441
7442 SDLoc DL(Op);
7443 EVT XVT, ExpVT;
7444 switch (Op.getSimpleValueType().SimpleTy) {
7445 default:
7446 return SDValue();
7447 case MVT::bf16:
7448 case MVT::f16:
7449 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7450 [[fallthrough]];
7451 case MVT::f32:
7452 XVT = MVT::nxv4f32;
7453 ExpVT = MVT::nxv4i32;
7454 break;
7455 case MVT::f64:
7456 XVT = MVT::nxv2f64;
7457 ExpVT = MVT::nxv2i64;
7458 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7459 break;
7460 }
7461
7462 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7463 SDValue VX =
7464 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7465 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7466 DAG.getUNDEF(ExpVT), Exp, Zero);
7467 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7468 AArch64SVEPredPattern::all);
7469 SDValue FScale =
7471 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7472 VPg, VX, VExp);
7473 SDValue Final =
7474 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7475 if (X.getValueType() != XScalarTy)
7476 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7477 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7478 return Final;
7479}
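The FSCALE-based lowering above computes x * 2^exp, i.e. ordinary ldexp semantics, with f16/bf16 inputs going through f32 and a final round. A tiny scalar statement of that equivalence (illustrative only):

#include <cassert>
#include <cmath>

// Illustrative only: fscale(x, exp) behaves like std::ldexp on scalars.
static void ldexpExample() {
  assert(std::ldexp(1.5, 4) == 24.0);   // 1.5 * 2^4
  assert(std::ldexp(-3.0, -1) == -1.5); // negative exponents scale down
}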
7480
7481SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7482 SelectionDAG &DAG) const {
7483 return Op.getOperand(0);
7484}
7485
7486SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7487 SelectionDAG &DAG) const {
7488 SDValue Chain = Op.getOperand(0);
7489 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7490 SDValue FPtr = Op.getOperand(2); // nested function
7491 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7492
7493 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7494
7495 // ldr NestReg, .+16
7496 // ldr x17, .+20
7497 // br x17
7498 // .word 0
7499 // .nest: .qword nest
7500 // .fptr: .qword fptr
7501 SDValue OutChains[5];
7502
7503 const Function *Func =
7504 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7505 CallingConv::ID CC = Func->getCallingConv();
7506 unsigned NestReg;
7507
7508 switch (CC) {
7509 default:
7510 NestReg = 0x0f; // X15
7511 break;
7513 // Must be kept in sync with AArch64CallingConv.td
7514 NestReg = 0x04; // X4
7515 break;
7516 }
7517
7518 const char FptrReg = 0x11; // X17
7519
7520 SDValue Addr = Trmp;
7521
7522 SDLoc DL(Op);
7523 OutChains[0] = DAG.getStore(
7524 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7525 MachinePointerInfo(TrmpAddr));
7526
7527 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7528 DAG.getConstant(4, DL, MVT::i64));
7529 OutChains[1] = DAG.getStore(
7530 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7531 MachinePointerInfo(TrmpAddr, 4));
7532
7533 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7534 DAG.getConstant(8, DL, MVT::i64));
7535 OutChains[2] =
7536 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7537 MachinePointerInfo(TrmpAddr, 8));
7538
7539 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7540 DAG.getConstant(16, DL, MVT::i64));
7541 OutChains[3] =
7542 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7543
7544 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7545 DAG.getConstant(24, DL, MVT::i64));
7546 OutChains[4] =
7547 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7548
7549 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7550
7551 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7552 DAG.getConstant(12, DL, MVT::i64));
7553
7554 // Call clear cache on the trampoline instructions.
7555 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7556 EndOfTrmp);
7557}
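The magic constants written above are A64 instruction words. A small decode sketch (illustrative only, helper names hypothetical) based on the LDR (literal) encoding, where bits [4:0] hold the destination register and bits [23:5] a 32-bit-word offset; that is why OR-ing in NestReg/FptrReg yields "ldr Xn, .+16" and "ldr x17, .+20", with 0xd61f0220 being "br x17":

#include <cassert>
#include <cstdint>

// Decodes Rt and the (small, positive) byte offset of an LDR (literal),
// ignoring the sign extension of imm19, which these constants do not need.
static void decodeLdrLiteral(uint32_t Insn, unsigned &Rt, int &ByteOffset) {
  Rt = Insn & 0x1F;
  ByteOffset = ((Insn >> 5) & 0x7FFFF) * 4; // imm19 counts 32-bit words
}
static void checkTrampolineWords() {
  unsigned Rt; int Off;
  decodeLdrLiteral(0x58000080u | 0x0F, Rt, Off); // default NestReg = X15
  assert(Rt == 15 && Off == 16);
  decodeLdrLiteral(0x580000b0u | 0x11, Rt, Off); // FptrReg = X17
  assert(Rt == 17 && Off == 20);
}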
7558
7559 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7560                                               SelectionDAG &DAG) const {
7561 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7562 LLVM_DEBUG(Op.dump());
7563
7564 switch (Op.getOpcode()) {
7565 default:
7566 llvm_unreachable("unimplemented operand");
7567 return SDValue();
7568   case ISD::LOOP_DEPENDENCE_RAW_MASK:
7569   case ISD::LOOP_DEPENDENCE_WAR_MASK:
7570     return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7571 case ISD::BITCAST:
7572 return LowerBITCAST(Op, DAG);
7573 case ISD::GlobalAddress:
7574 return LowerGlobalAddress(Op, DAG);
7575   case ISD::GlobalTLSAddress:
7576     return LowerGlobalTLSAddress(Op, DAG);
7577   case ISD::PtrAuthGlobalAddress:
7578     return LowerPtrAuthGlobalAddress(Op, DAG);
7579 case ISD::ADJUST_TRAMPOLINE:
7580 return LowerADJUST_TRAMPOLINE(Op, DAG);
7581 case ISD::INIT_TRAMPOLINE:
7582 return LowerINIT_TRAMPOLINE(Op, DAG);
7583 case ISD::SETCC:
7584 case ISD::STRICT_FSETCC:
7585   case ISD::STRICT_FSETCCS:
7586     return LowerSETCC(Op, DAG);
7587 case ISD::SETCCCARRY:
7588 return LowerSETCCCARRY(Op, DAG);
7589 case ISD::BRCOND:
7590 return LowerBRCOND(Op, DAG);
7591 case ISD::BR_CC:
7592 return LowerBR_CC(Op, DAG);
7593 case ISD::SELECT:
7594 return LowerSELECT(Op, DAG);
7595 case ISD::SELECT_CC:
7596 return LowerSELECT_CC(Op, DAG);
7597 case ISD::JumpTable:
7598 return LowerJumpTable(Op, DAG);
7599 case ISD::BR_JT:
7600 return LowerBR_JT(Op, DAG);
7601 case ISD::BRIND:
7602 return LowerBRIND(Op, DAG);
7603 case ISD::ConstantPool:
7604 return LowerConstantPool(Op, DAG);
7605 case ISD::BlockAddress:
7606 return LowerBlockAddress(Op, DAG);
7607 case ISD::VASTART:
7608 return LowerVASTART(Op, DAG);
7609 case ISD::VACOPY:
7610 return LowerVACOPY(Op, DAG);
7611 case ISD::VAARG:
7612 return LowerVAARG(Op, DAG);
7613 case ISD::UADDO_CARRY:
7614 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7615 case ISD::USUBO_CARRY:
7616 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7617 case ISD::SADDO_CARRY:
7618 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7619 case ISD::SSUBO_CARRY:
7620 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7621 case ISD::SADDO:
7622 case ISD::UADDO:
7623 case ISD::SSUBO:
7624 case ISD::USUBO:
7625 case ISD::SMULO:
7626 case ISD::UMULO:
7627 return LowerXALUO(Op, DAG);
7628 case ISD::FADD:
7629 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7630 case ISD::FSUB:
7631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7632 case ISD::FMUL:
7633 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7634 case ISD::FMA:
7635 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7636 case ISD::FDIV:
7637 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7638 case ISD::FNEG:
7639 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7640 case ISD::FCEIL:
7641 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7642 case ISD::FFLOOR:
7643 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7644 case ISD::FNEARBYINT:
7645 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7646 case ISD::FRINT:
7647 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7648 case ISD::FROUND:
7649 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7650 case ISD::FROUNDEVEN:
7651 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7652 case ISD::FTRUNC:
7653 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7654 case ISD::FSQRT:
7655 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7656 case ISD::FABS:
7657 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7658 case ISD::FP_ROUND:
7659   case ISD::STRICT_FP_ROUND:
7660     return LowerFP_ROUND(Op, DAG);
7661 case ISD::FP_EXTEND:
7662   case ISD::STRICT_FP_EXTEND:
7663     return LowerFP_EXTEND(Op, DAG);
7664 case ISD::FRAMEADDR:
7665 return LowerFRAMEADDR(Op, DAG);
7666 case ISD::SPONENTRY:
7667 return LowerSPONENTRY(Op, DAG);
7668 case ISD::RETURNADDR:
7669 return LowerRETURNADDR(Op, DAG);
7670   case ISD::ADDROFRETURNADDR:
7671     return LowerADDROFRETURNADDR(Op, DAG);
7672   case ISD::CONCAT_VECTORS:
7673     return LowerCONCAT_VECTORS(Op, DAG);
7674   case ISD::INSERT_VECTOR_ELT:
7675     return LowerINSERT_VECTOR_ELT(Op, DAG);
7676   case ISD::EXTRACT_VECTOR_ELT:
7677     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7678 case ISD::BUILD_VECTOR:
7679 return LowerBUILD_VECTOR(Op, DAG);
7680   case ISD::ZERO_EXTEND_VECTOR_INREG:
7681     return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7682   case ISD::VECTOR_SHUFFLE:
7683     return LowerVECTOR_SHUFFLE(Op, DAG);
7684 case ISD::SPLAT_VECTOR:
7685 return LowerSPLAT_VECTOR(Op, DAG);
7686   case ISD::EXTRACT_SUBVECTOR:
7687     return LowerEXTRACT_SUBVECTOR(Op, DAG);
7688   case ISD::INSERT_SUBVECTOR:
7689     return LowerINSERT_SUBVECTOR(Op, DAG);
7690 case ISD::SDIV:
7691 case ISD::UDIV:
7692 return LowerDIV(Op, DAG);
7693 case ISD::SMIN:
7694 case ISD::UMIN:
7695 case ISD::SMAX:
7696 case ISD::UMAX:
7697 return LowerMinMax(Op, DAG);
7698 case ISD::SRA:
7699 case ISD::SRL:
7700 case ISD::SHL:
7701 return LowerVectorSRA_SRL_SHL(Op, DAG);
7702 case ISD::SHL_PARTS:
7703 case ISD::SRL_PARTS:
7704 case ISD::SRA_PARTS:
7705 return LowerShiftParts(Op, DAG);
7706 case ISD::CTPOP:
7707 case ISD::PARITY:
7708 return LowerCTPOP_PARITY(Op, DAG);
7709 case ISD::FCOPYSIGN:
7710 return LowerFCOPYSIGN(Op, DAG);
7711 case ISD::OR:
7712 return LowerVectorOR(Op, DAG);
7713 case ISD::XOR:
7714 return LowerXOR(Op, DAG);
7715 case ISD::PREFETCH:
7716 return LowerPREFETCH(Op, DAG);
7717 case ISD::SINT_TO_FP:
7718 case ISD::UINT_TO_FP:
7719   case ISD::STRICT_SINT_TO_FP:
7720   case ISD::STRICT_UINT_TO_FP:
7721     return LowerINT_TO_FP(Op, DAG);
7722 case ISD::FP_TO_SINT:
7723 case ISD::FP_TO_UINT:
7724   case ISD::STRICT_FP_TO_SINT:
7725   case ISD::STRICT_FP_TO_UINT:
7726     return LowerFP_TO_INT(Op, DAG);
7727   case ISD::FP_TO_SINT_SAT:
7728   case ISD::FP_TO_UINT_SAT:
7729     return LowerFP_TO_INT_SAT(Op, DAG);
7730 case ISD::FSINCOS:
7731 return LowerFSINCOS(Op, DAG);
7732 case ISD::GET_ROUNDING:
7733 return LowerGET_ROUNDING(Op, DAG);
7734 case ISD::SET_ROUNDING:
7735 return LowerSET_ROUNDING(Op, DAG);
7736 case ISD::GET_FPMODE:
7737 return LowerGET_FPMODE(Op, DAG);
7738 case ISD::SET_FPMODE:
7739 return LowerSET_FPMODE(Op, DAG);
7740 case ISD::RESET_FPMODE:
7741 return LowerRESET_FPMODE(Op, DAG);
7742 case ISD::MUL:
7743 return LowerMUL(Op, DAG);
7744 case ISD::MULHS:
7745 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7746 case ISD::MULHU:
7747 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7748   case ISD::INTRINSIC_W_CHAIN:
7749     return LowerINTRINSIC_W_CHAIN(Op, DAG);
7750   case ISD::INTRINSIC_WO_CHAIN:
7751     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7752   case ISD::INTRINSIC_VOID:
7753     return LowerINTRINSIC_VOID(Op, DAG);
7754 case ISD::ATOMIC_STORE:
7755 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7756 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7757 return LowerStore128(Op, DAG);
7758 }
7759 return SDValue();
7760 case ISD::STORE:
7761 return LowerSTORE(Op, DAG);
7762 case ISD::MSTORE:
7763 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7764 case ISD::MGATHER:
7765 return LowerMGATHER(Op, DAG);
7766 case ISD::MSCATTER:
7767 return LowerMSCATTER(Op, DAG);
7768 case ISD::VECREDUCE_SEQ_FADD:
7769 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7770 case ISD::VECREDUCE_ADD:
7771 case ISD::VECREDUCE_AND:
7772 case ISD::VECREDUCE_OR:
7773 case ISD::VECREDUCE_XOR:
7774 case ISD::VECREDUCE_SMAX:
7775 case ISD::VECREDUCE_SMIN:
7776 case ISD::VECREDUCE_UMAX:
7777 case ISD::VECREDUCE_UMIN:
7778 case ISD::VECREDUCE_FADD:
7779 case ISD::VECREDUCE_FMAX:
7780 case ISD::VECREDUCE_FMIN:
7781 case ISD::VECREDUCE_FMAXIMUM:
7782 case ISD::VECREDUCE_FMINIMUM:
7783 return LowerVECREDUCE(Op, DAG);
7784 case ISD::ATOMIC_LOAD_AND:
7785 return LowerATOMIC_LOAD_AND(Op, DAG);
7786 case ISD::DYNAMIC_STACKALLOC:
7787 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7788 case ISD::VSCALE:
7789 return LowerVSCALE(Op, DAG);
7790   case ISD::VECTOR_COMPRESS:
7791     return LowerVECTOR_COMPRESS(Op, DAG);
7792 case ISD::ANY_EXTEND:
7793 case ISD::SIGN_EXTEND:
7794 case ISD::ZERO_EXTEND:
7795 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7796 case ISD::ADDRSPACECAST:
7797 return LowerADDRSPACECAST(Op, DAG);
7798   case ISD::SIGN_EXTEND_INREG: {
7799     // Only custom lower when ExtraVT has a legal byte-based element type.
7800 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7801 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7802 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7803 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7804 return SDValue();
7805
7806 return LowerToPredicatedOp(Op, DAG,
7807 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7808 }
7809 case ISD::TRUNCATE:
7810 return LowerTRUNCATE(Op, DAG);
7811 case ISD::MLOAD:
7812 return LowerMLOAD(Op, DAG);
7813 case ISD::LOAD:
7814 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7815 !Subtarget->isNeonAvailable()))
7816 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7817 return LowerLOAD(Op, DAG);
7818 case ISD::ADD:
7819 case ISD::AND:
7820 case ISD::SUB:
7821 return LowerToScalableOp(Op, DAG);
7822 case ISD::FMAXIMUM:
7823 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7824 case ISD::FMAXNUM:
7825 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7826 case ISD::FMINIMUM:
7827 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7828 case ISD::FMINNUM:
7829 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7830 case ISD::VSELECT:
7831 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7832 case ISD::ABS:
7833 return LowerABS(Op, DAG);
7834 case ISD::ABDS:
7835 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7836 case ISD::ABDU:
7837 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7838 case ISD::AVGFLOORS:
7839 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7840 case ISD::AVGFLOORU:
7841 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7842 case ISD::AVGCEILS:
7843 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7844 case ISD::AVGCEILU:
7845 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7846 case ISD::BITREVERSE:
7847 return LowerBitreverse(Op, DAG);
7848 case ISD::BSWAP:
7849 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7850 case ISD::CTLZ:
7851 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7852 case ISD::CTTZ:
7853 return LowerCTTZ(Op, DAG);
7854 case ISD::VECTOR_SPLICE:
7855 return LowerVECTOR_SPLICE(Op, DAG);
7856   case ISD::VECTOR_DEINTERLEAVE:
7857     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7858   case ISD::VECTOR_INTERLEAVE:
7859     return LowerVECTOR_INTERLEAVE(Op, DAG);
7860 case ISD::GET_ACTIVE_LANE_MASK:
7861 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7862 case ISD::LRINT:
7863 case ISD::LLRINT:
7864 if (Op.getValueType().isVector())
7865 return LowerVectorXRINT(Op, DAG);
7866 [[fallthrough]];
7867 case ISD::LROUND:
7868 case ISD::LLROUND: {
7869 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7870 Op.getOperand(0).getValueType() == MVT::bf16) &&
7871 "Expected custom lowering of rounding operations only for f16");
7872 SDLoc DL(Op);
7873 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7874 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7875 }
7876 case ISD::STRICT_LROUND:
7877   case ISD::STRICT_LLROUND:
7878   case ISD::STRICT_LRINT:
7879 case ISD::STRICT_LLRINT: {
7880 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7881 Op.getOperand(1).getValueType() == MVT::bf16) &&
7882 "Expected custom lowering of rounding operations only for f16");
7883 SDLoc DL(Op);
7884 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7885 {Op.getOperand(0), Op.getOperand(1)});
7886 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7887 {Ext.getValue(1), Ext.getValue(0)});
7888 }
7889 case ISD::WRITE_REGISTER: {
7890 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7891 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7892 SDLoc DL(Op);
7893
7894 SDValue Chain = Op.getOperand(0);
7895 SDValue SysRegName = Op.getOperand(1);
7896 std::pair<SDValue, SDValue> Pair =
7897 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7898
7899 // chain = MSRR(chain, sysregname, lo, hi)
7900 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7901 SysRegName, Pair.first, Pair.second);
7902
7903 return Result;
7904 }
7905 case ISD::FSHL:
7906 case ISD::FSHR:
7907 return LowerFunnelShift(Op, DAG);
7908 case ISD::FLDEXP:
7909 return LowerFLDEXP(Op, DAG);
7910 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7911 return LowerVECTOR_HISTOGRAM(Op, DAG);
7912 case ISD::PARTIAL_REDUCE_SMLA:
7913 case ISD::PARTIAL_REDUCE_UMLA:
7914 case ISD::PARTIAL_REDUCE_SUMLA:
7915 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7916 }
7917}
7918
7919 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7920   return !Subtarget->useSVEForFixedLengthVectors();
7921}
7922
7923 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7924     EVT VT, bool OverrideNEON) const {
7925 if (!VT.isFixedLengthVector() || !VT.isSimple())
7926 return false;
7927
7928 // Don't use SVE for vectors we cannot scalarize if required.
7929 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7930 // Fixed length predicates should be promoted to i8.
7931 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7932 case MVT::i1:
7933 default:
7934 return false;
7935 case MVT::i8:
7936 case MVT::i16:
7937 case MVT::i32:
7938 case MVT::i64:
7939 case MVT::f16:
7940 case MVT::f32:
7941 case MVT::f64:
7942 break;
7943 }
7944
7945 // NEON-sized vectors can be emulated using SVE instructions.
7946 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7947 return Subtarget->isSVEorStreamingSVEAvailable();
7948
7949 // Ensure NEON MVTs only belong to a single register class.
7950 if (VT.getFixedSizeInBits() <= 128)
7951 return false;
7952
7953 // Ensure wider than NEON code generation is enabled.
7954 if (!Subtarget->useSVEForFixedLengthVectors())
7955 return false;
7956
7957 // Don't use SVE for types that don't fit.
7958 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7959 return false;
7960
7961 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7962 // the base fixed length SVE support in place.
7963 if (!VT.isPow2VectorType())
7964 return false;
7965
7966 return true;
7967}
7968
7969//===----------------------------------------------------------------------===//
7970// Calling Convention Implementation
7971//===----------------------------------------------------------------------===//
7972
7973static unsigned getIntrinsicID(const SDNode *N) {
7974 unsigned Opcode = N->getOpcode();
7975 switch (Opcode) {
7976 default:
7977     return Intrinsic::not_intrinsic;
7978   case ISD::INTRINSIC_WO_CHAIN: {
7979     unsigned IID = N->getConstantOperandVal(0);
7980 if (IID < Intrinsic::num_intrinsics)
7981 return IID;
7982     return Intrinsic::not_intrinsic;
7983   }
7984 }
7985}
7986
7987 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7988                                                 SDValue N1) const {
7989 if (!N0.hasOneUse())
7990 return false;
7991
7992 unsigned IID = getIntrinsicID(N1.getNode());
7993 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7994 if (IID == Intrinsic::aarch64_neon_umull ||
7995 N1.getOpcode() == AArch64ISD::UMULL ||
7996 IID == Intrinsic::aarch64_neon_smull ||
7997 N1.getOpcode() == AArch64ISD::SMULL)
7998 return N0.getOpcode() != ISD::ADD;
7999
8000 return true;
8001}
8002
8003/// Selects the correct CCAssignFn for a given CallingConvention value.
8004 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
8005                                                      bool IsVarArg) const {
8006 switch (CC) {
8007 default:
8008 reportFatalUsageError("unsupported calling convention");
8009 case CallingConv::GHC:
8010 return CC_AArch64_GHC;
8011   case CallingConv::PreserveNone:
8012     // The VarArg implementation makes assumptions about register
8013 // argument passing that do not hold for preserve_none, so we
8014 // instead fall back to C argument passing.
8015 // The non-vararg case is handled in the CC function itself.
8016 if (!IsVarArg)
8017       return CC_AArch64_Preserve_None;
8018     [[fallthrough]];
8019 case CallingConv::C:
8020 case CallingConv::Fast:
8021   case CallingConv::PreserveMost:
8022   case CallingConv::PreserveAll:
8023   case CallingConv::CXX_FAST_TLS:
8024   case CallingConv::Swift:
8025   case CallingConv::SwiftTail:
8026   case CallingConv::Tail:
8027 case CallingConv::GRAAL:
8028 if (Subtarget->isTargetWindows()) {
8029 if (IsVarArg) {
8030 if (Subtarget->isWindowsArm64EC())
8031           return CC_AArch64_Arm64EC_VarArg;
8032         return CC_AArch64_Win64_VarArg;
8033       }
8034 return CC_AArch64_Win64PCS;
8035 }
8036 if (!Subtarget->isTargetDarwin())
8037 return CC_AArch64_AAPCS;
8038 if (!IsVarArg)
8039 return CC_AArch64_DarwinPCS;
8040 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8041                                       : CC_AArch64_DarwinPCS_VarArg;
8042   case CallingConv::Win64:
8043 if (IsVarArg) {
8044 if (Subtarget->isWindowsArm64EC())
8045         return CC_AArch64_Arm64EC_VarArg;
8046       return CC_AArch64_Win64_VarArg;
8047     }
8048 return CC_AArch64_Win64PCS;
8049   case CallingConv::CFGuard_Check:
8050     if (Subtarget->isWindowsArm64EC())
8051       return CC_AArch64_Arm64EC_CFGuard_Check;
8052     return CC_AArch64_Win64_CFGuard_Check;
8053   case CallingConv::AArch64_VectorCall:
8054   case CallingConv::AArch64_SVE_VectorCall:
8055   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
8056   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
8057   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
8058     return CC_AArch64_AAPCS;
8059   case CallingConv::ARM64EC_Thunk_X64:
8060     return CC_AArch64_Arm64EC_Thunk;
8061   case CallingConv::ARM64EC_Thunk_Native:
8062     return CC_AArch64_Arm64EC_Thunk_Native;
8063   }
8064}
8065
8066CCAssignFn *
8067 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8068   switch (CC) {
8069 default:
8070 return RetCC_AArch64_AAPCS;
8071   case CallingConv::ARM64EC_Thunk_X64:
8072     return RetCC_AArch64_Arm64EC_Thunk;
8073   case CallingConv::CFGuard_Check:
8074     if (Subtarget->isWindowsArm64EC())
8075       return RetCC_AArch64_Arm64EC_CFGuard_Check;
8076     return RetCC_AArch64_AAPCS;
8077 }
8078}
8079
8080static bool isPassedInFPR(EVT VT) {
8081 return VT.isFixedLengthVector() ||
8082 (VT.isFloatingPoint() && !VT.isScalableVector());
8083}
8084
8086 AArch64FunctionInfo &FuncInfo,
8087 SelectionDAG &DAG) {
8088 if (!FuncInfo.hasZT0SpillSlotIndex())
8089 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8090
8091 return DAG.getFrameIndex(
8092 FuncInfo.getZT0SpillSlotIndex(),
8094}
8095
8096SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8097 SelectionDAG &DAG) const {
8098 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8099 SDValue Glue = Chain.getValue(1);
8100
8101 MachineFunction &MF = DAG.getMachineFunction();
8102 SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
8103
8104 // The following conditions are true on entry to an exception handler:
8105 // - PSTATE.SM is 0.
8106 // - PSTATE.ZA is 0.
8107 // - TPIDR2_EL0 is null.
8108 // See:
8109 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8110 //
8111 // Therefore, if the function that contains this exception handler is a
8112 // streaming[-compatible] function, we must re-enable streaming mode.
8113 //
8114 // These mode changes are usually optimized away in catch blocks as they
8115 // occur before the __cxa_begin_catch (which is a non-streaming function),
8116 // but are necessary in some cases (such as for cleanups).
8117
8118 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8119 return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8120 /*Glue*/ Glue, AArch64SME::Always);
8121
8122 if (SMEFnAttrs.hasStreamingCompatibleInterface())
8123 return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8124                                AArch64SME::IfCallerIsStreaming);
8125
8126 return Chain;
8127}
8128
8129SDValue AArch64TargetLowering::LowerFormalArguments(
8130 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8131 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8132 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8133 MachineFunction &MF = DAG.getMachineFunction();
8134 const Function &F = MF.getFunction();
8135 MachineFrameInfo &MFI = MF.getFrameInfo();
8136 bool IsWin64 =
8137 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8138 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8139 (isVarArg && Subtarget->isWindowsArm64EC());
8140 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8141
8142   SmallVector<ISD::OutputArg, 4> Outs;
8143   GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8144                 DAG.getTargetLoweringInfo(), MF.getDataLayout());
8145 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8146 FuncInfo->setIsSVECC(true);
8147
8148 // Assign locations to all of the incoming arguments.
8149   SmallVector<CCValAssign, 16> ArgLocs;
8150   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8151
8152 // At this point, Ins[].VT may already be promoted to i32. To correctly
8153 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8154 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8155 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8156 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8157 // LocVT.
8158 unsigned NumArgs = Ins.size();
8159 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8160 unsigned CurArgIdx = 0;
8161 bool UseVarArgCC = false;
8162 if (IsWin64)
8163 UseVarArgCC = isVarArg;
8164
8165 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8166
8167 for (unsigned i = 0; i != NumArgs; ++i) {
8168 MVT ValVT = Ins[i].VT;
8169 if (Ins[i].isOrigArg()) {
8170 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8171 CurArgIdx = Ins[i].getOrigArgIndex();
8172
8173 // Get type of the original argument.
8174 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8175 /*AllowUnknown*/ true);
8176 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8177 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8178 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8179 ValVT = MVT::i8;
8180 else if (ActualMVT == MVT::i16)
8181 ValVT = MVT::i16;
8182 }
8183 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8184 Ins[i].OrigTy, CCInfo);
8185 assert(!Res && "Call operand has unhandled type");
8186 (void)Res;
8187 }
8188
8189 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8190 bool IsLocallyStreaming =
8191 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8192 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8193 SDValue Glue = Chain.getValue(1);
8194
8195 unsigned ExtraArgLocs = 0;
8196 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8197 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8198
8199 if (Ins[i].Flags.isByVal()) {
8200 // Byval is used for HFAs in the PCS, but the system should work in a
8201 // non-compliant manner for larger structs.
8202 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8203 int Size = Ins[i].Flags.getByValSize();
8204 unsigned NumRegs = (Size + 7) / 8;
8205
8206 // FIXME: This works on big-endian for composite byvals, which are the common
8207 // case. It should also work for fundamental types too.
8208 unsigned FrameIdx =
8209 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8210 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8211 InVals.push_back(FrameIdxN);
8212
8213 continue;
8214 }
8215
8216 if (Ins[i].Flags.isSwiftAsync())
8217 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8218
8219 SDValue ArgValue;
8220 if (VA.isRegLoc()) {
8221 // Arguments stored in registers.
8222 EVT RegVT = VA.getLocVT();
8223 const TargetRegisterClass *RC;
8224
8225 if (RegVT == MVT::i32)
8226 RC = &AArch64::GPR32RegClass;
8227 else if (RegVT == MVT::i64)
8228 RC = &AArch64::GPR64RegClass;
8229 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8230 RC = &AArch64::FPR16RegClass;
8231 else if (RegVT == MVT::f32)
8232 RC = &AArch64::FPR32RegClass;
8233 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8234 RC = &AArch64::FPR64RegClass;
8235 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8236 RC = &AArch64::FPR128RegClass;
8237 else if (RegVT.isScalableVector() &&
8238 RegVT.getVectorElementType() == MVT::i1) {
8239 FuncInfo->setIsSVECC(true);
8240 RC = &AArch64::PPRRegClass;
8241 } else if (RegVT == MVT::aarch64svcount) {
8242 FuncInfo->setIsSVECC(true);
8243 RC = &AArch64::PPRRegClass;
8244 } else if (RegVT.isScalableVector()) {
8245 FuncInfo->setIsSVECC(true);
8246 RC = &AArch64::ZPRRegClass;
8247 } else
8248 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8249
8250 // Transform the arguments in physical registers into virtual ones.
8251 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8252
8253 if (IsLocallyStreaming) {
8254 // LocallyStreamingFunctions must insert the SMSTART in the correct
8255 // position, so we use Glue to ensure no instructions can be scheduled
8256 // between the chain of:
8257 // t0: ch,glue = EntryNode
8258 // t1: res,ch,glue = CopyFromReg
8259 // ...
8260 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8261 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8262 // ^^^^^^
8263 // This will be the new Chain/Root node.
8264 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8265 Glue = ArgValue.getValue(2);
8266 if (isPassedInFPR(ArgValue.getValueType())) {
8267 ArgValue =
8268 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8269 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8270 {ArgValue, Glue});
8271 Glue = ArgValue.getValue(1);
8272 }
8273 } else
8274 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8275
8276 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8277 // to 64 bits. Insert an assert[sz]ext to capture this, then
8278 // truncate to the right size.
8279 switch (VA.getLocInfo()) {
8280 default:
8281 llvm_unreachable("Unknown loc info!");
8282 case CCValAssign::Full:
8283 break;
8284 case CCValAssign::Indirect:
8285 assert(
8286 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8287 "Indirect arguments should be scalable on most subtargets");
8288 break;
8289 case CCValAssign::BCvt:
8290 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8291 break;
8292 case CCValAssign::AExt:
8293 case CCValAssign::SExt:
8294 case CCValAssign::ZExt:
8295 break;
8296 case CCValAssign::AExtUpper:
8297 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8298 DAG.getConstant(32, DL, RegVT));
8299 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8300 break;
8301 }
8302 } else { // VA.isRegLoc()
8303 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8304 unsigned ArgOffset = VA.getLocMemOffset();
8305 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8306 ? VA.getLocVT().getSizeInBits()
8307 : VA.getValVT().getSizeInBits()) / 8;
8308
8309 uint32_t BEAlign = 0;
8310 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8311 !Ins[i].Flags.isInConsecutiveRegs())
8312 BEAlign = 8 - ArgSize;
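// Worked example for BEAlign (illustrative): on a big-endian target an i32
// argument occupies the low 32 bits of its 8-byte slot, and in big-endian
// those low-order bytes sit at the highest addresses of the slot, so
// ArgSize = 4 gives BEAlign = 8 - 4 = 4 and the load address is bumped by 4.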
8313
8314 SDValue FIN;
8315 MachinePointerInfo PtrInfo;
8316 if (StackViaX4) {
8317 // In both the ARM64EC varargs convention and the thunk convention,
8318 // arguments on the stack are accessed relative to x4, not sp. In
8319 // the thunk convention, there's an additional offset of 32 bytes
8320 // to account for the shadow store.
8321 unsigned ObjOffset = ArgOffset + BEAlign;
8322 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8323 ObjOffset += 32;
8324 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8325 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8326 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8327 DAG.getConstant(ObjOffset, DL, MVT::i64));
8328 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8329 } else {
8330 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8331
8332 // Create load nodes to retrieve arguments from the stack.
8333 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8334 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8335 }
8336
8337 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8338 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8339 MVT MemVT = VA.getValVT();
8340
8341 switch (VA.getLocInfo()) {
8342 default:
8343 break;
8344 case CCValAssign::Trunc:
8345 case CCValAssign::BCvt:
8346 MemVT = VA.getLocVT();
8347 break;
8348 case CCValAssign::Indirect:
8349 assert((VA.getValVT().isScalableVT() ||
8350 Subtarget->isWindowsArm64EC()) &&
8351 "Indirect arguments should be scalable on most subtargets");
8352 MemVT = VA.getLocVT();
8353 break;
8354 case CCValAssign::SExt:
8355 ExtType = ISD::SEXTLOAD;
8356 break;
8357 case CCValAssign::ZExt:
8358 ExtType = ISD::ZEXTLOAD;
8359 break;
8360 case CCValAssign::AExt:
8361 ExtType = ISD::EXTLOAD;
8362 break;
8363 }
8364
8365 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8366 MemVT);
8367 }
8368
8369 if (VA.getLocInfo() == CCValAssign::Indirect) {
8370 assert((VA.getValVT().isScalableVT() ||
8371 Subtarget->isWindowsArm64EC()) &&
8372 "Indirect arguments should be scalable on most subtargets");
8373
8374 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8375 unsigned NumParts = 1;
8376 if (Ins[i].Flags.isInConsecutiveRegs()) {
8377 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8378 ++NumParts;
8379 }
8380
8381 MVT PartLoad = VA.getValVT();
8382 SDValue Ptr = ArgValue;
8383
8384 // Ensure we generate all loads for each tuple part, whilst updating the
8385 // pointer after each load correctly using vscale.
8386 while (NumParts > 0) {
8387 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8388 InVals.push_back(ArgValue);
8389 NumParts--;
8390 if (NumParts > 0) {
8391 SDValue BytesIncrement;
8392 if (PartLoad.isScalableVector()) {
8393 BytesIncrement = DAG.getVScale(
8394 DL, Ptr.getValueType(),
8395 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8396 } else {
8397 BytesIncrement = DAG.getConstant(
8398 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8399 Ptr.getValueType());
8400 }
8401 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8402 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8403 ExtraArgLocs++;
8404 i++;
8405 }
8406 }
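// Illustrative sketch of the pointer stepping above (not part of this file):
// for an indirectly passed two-part SVE tuple, each part has
// PartSize = 16 bytes scaled by vscale, so after loading part N the pointer
// is advanced before loading part N+1:
//
//   Ptr = Ptr + vscale * PartSize   // DAG.getVScale + ISD::ADD with nuw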
8407 } else {
8408 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8409 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8410 ArgValue, DAG.getValueType(MVT::i32));
8411
8412 // i1 arguments are zero-extended to i8 by the caller. Emit a
8413 // hint to reflect this.
8414 if (Ins[i].isOrigArg()) {
8415 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8416 if (OrigArg->getType()->isIntegerTy(1)) {
8417 if (!Ins[i].Flags.isZExt()) {
8418 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8419 ArgValue.getValueType(), ArgValue);
8420 }
8421 }
8422 }
8423
8424 InVals.push_back(ArgValue);
8425 }
8426 }
8427 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8428
8429 if (Attrs.hasStreamingCompatibleInterface()) {
8430 SDValue EntryPStateSM =
8431 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8432 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8433
8434 // Copy the value to a virtual register, and save that in FuncInfo.
8435 Register EntryPStateSMReg =
8436 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8437 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8438 EntryPStateSM);
8439 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8440 }
8441
8442 // Insert the SMSTART if this is a locally streaming function and
8443 // make sure it is Glued to the last CopyFromReg value.
8444 if (IsLocallyStreaming) {
8445 if (Attrs.hasStreamingCompatibleInterface())
8446 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8447 AArch64SME::IfCallerIsNonStreaming);
8448 else
8449 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8450 AArch64SME::Always);
8451
8452 // Ensure that the SMSTART happens after the CopyWithChain such that its
8453 // chain result is used.
8454 for (unsigned I=0; I<InVals.size(); ++I) {
8455 Register Reg = MF.getRegInfo().createVirtualRegister(
8456 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8457 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8458 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8459 InVals[I].getValueType());
8460 }
8461 }
8462
8463 // varargs
8464 if (isVarArg) {
8466 if (!Subtarget->isTargetDarwin() || IsWin64) {
8467 // The AAPCS variadic function ABI is identical to the non-variadic
8468 // one. As a result there may be more arguments in registers and we
8469 // should save them for future reference.
8470 // Win64 variadic functions also pass arguments in registers, but all
8471 // float arguments are passed in integer registers.
8472 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8473 }
8474
8475 // This will point to the next argument passed via stack.
8476 unsigned VarArgsOffset = CCInfo.getStackSize();
8477 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8478 VarArgsOffset =
8479 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8480 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8481 FuncInfo->setVarArgsStackIndex(
8482 MFI.CreateFixedObject(4, VarArgsOffset, true));
8483 }
8484
8485 if (MFI.hasMustTailInVarArgFunc()) {
8486 SmallVector<MVT, 2> RegParmTypes;
8487 RegParmTypes.push_back(MVT::i64);
8488 RegParmTypes.push_back(MVT::f128);
8489 // Compute the set of forwarded registers. The rest are scratch.
8490 SmallVectorImpl<ForwardedRegister> &Forwards =
8491 FuncInfo->getForwardedMustTailRegParms();
8492 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8493 CC_AArch64_AAPCS);
8494
8495 // Conservatively forward X8, since it might be used for aggregate return.
8496 if (!CCInfo.isAllocated(AArch64::X8)) {
8497 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8498 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8499 }
8500 }
8501 }
8502
8503 // On Windows, InReg pointers must be returned, so record the pointer in a
8504 // virtual register at the start of the function so it can be returned in the
8505 // epilogue.
8506 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8507 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8508 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8509 Ins[I].Flags.isInReg()) &&
8510 Ins[I].Flags.isSRet()) {
8511 assert(!FuncInfo->getSRetReturnReg());
8512
8513 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8514 Register Reg =
8515 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8516 FuncInfo->setSRetReturnReg(Reg);
8517
8518 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8519 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8520 break;
8521 }
8522 }
8523 }
8524
8525 unsigned StackArgSize = CCInfo.getStackSize();
8526 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8527 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8528 // This is a non-standard ABI so by fiat I say we're allowed to make full
8529 // use of the stack area to be popped, which must be aligned to 16 bytes in
8530 // any case:
8531 StackArgSize = alignTo(StackArgSize, 16);
8532
8533 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8534 // a multiple of 16.
8535 FuncInfo->setArgumentStackToRestore(StackArgSize);
8536
8537 // This realignment carries over to the available bytes below. Our own
8538 // callers will guarantee the space is free by giving an aligned value to
8539 // CALLSEQ_START.
8540 }
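// Worked example for the callee-pops case above (illustrative): with fastcc
// and GuaranteedTailCallOpt, 20 bytes of stack arguments become
// StackArgSize = alignTo(20, 16) = 32, so the callee pops a 16-byte aligned
// 32-byte area on return.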
8541 // Even if we're not expected to free up the space, it's useful to know how
8542 // much is there while considering tail calls (because we can reuse it).
8543 FuncInfo->setBytesInStackArgArea(StackArgSize);
8544
8545 if (Subtarget->hasCustomCallingConv())
8546 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8547
8548 if (getTM().useNewSMEABILowering()) {
8549 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8550 SDValue Size;
8551 if (Attrs.hasZAState()) {
8552 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8553 DAG.getConstant(1, DL, MVT::i32));
8554 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8555 } else if (Attrs.hasAgnosticZAInterface()) {
8556 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8557 SDValue Callee = DAG.getExternalSymbol(
8558 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8559 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8560 TargetLowering::CallLoweringInfo CLI(DAG);
8561 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8562 getLibcallCallingConv(LC), RetTy, Callee, {});
8563 std::tie(Size, Chain) = LowerCallTo(CLI);
8564 }
8565 if (Size) {
8566 SDValue Buffer = DAG.getNode(
8567 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8568 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8569 Chain = Buffer.getValue(1);
8570
8571 Register BufferPtr =
8572 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8573 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8574 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8575 DAG.getVTList(MVT::Other), Chain);
8576 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8577 MFI.CreateVariableSizedObject(Align(16), nullptr);
8578 }
8579 }
8580 } else {
8581 // Old SME ABI lowering (deprecated):
8582 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8583 // will be expanded and stored in the static object later using a
8584 // pseudonode.
8585 if (Attrs.hasZAState()) {
8586 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8587 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8588 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8589 DAG.getConstant(1, DL, MVT::i32));
8590 SDValue Buffer;
8591 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8592 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8593 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8594 } else {
8595 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8596 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8597 DAG.getVTList(MVT::i64, MVT::Other),
8598 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8599 MFI.CreateVariableSizedObject(Align(16), nullptr);
8600 }
8601 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8602 DAG.getConstant(1, DL, MVT::i32));
8603 Chain = DAG.getNode(
8604 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8605 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8606 /*Num save slices*/ NumZaSaveSlices});
8607 } else if (Attrs.hasAgnosticZAInterface()) {
8608 // Call __arm_sme_state_size().
8609 SDValue BufferSize =
8610 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8611 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8612 Chain = BufferSize.getValue(1);
8613 SDValue Buffer;
8614 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8615 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8616 DAG.getVTList(MVT::i64, MVT::Other),
8617 {Chain, BufferSize});
8618 } else {
8619 // Allocate space dynamically.
8620 Buffer = DAG.getNode(
8621 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8622 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8623 MFI.CreateVariableSizedObject(Align(16), nullptr);
8624 }
8625 // Copy the value to a virtual register, and save that in FuncInfo.
8626 Register BufferPtr =
8627 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8628 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8629 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8630 }
8631 }
8632
8633 if (CallConv == CallingConv::PreserveNone) {
8634 for (const ISD::InputArg &I : Ins) {
8635 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8636 I.Flags.isSwiftAsync()) {
8637 MachineFunction &MF = DAG.getMachineFunction();
8638 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8639 MF.getFunction(),
8640 "Swift attributes can't be used with preserve_none",
8641 DL.getDebugLoc()));
8642 break;
8643 }
8644 }
8645 }
8646
8647 if (getTM().useNewSMEABILowering()) {
8648 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8649 if (Attrs.isNewZT0())
8650 Chain = DAG.getNode(
8651 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8652 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8653 DAG.getTargetConstant(0, DL, MVT::i32));
8654 }
8655
8656 return Chain;
8657}
8658
8659void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8660 SelectionDAG &DAG,
8661 const SDLoc &DL,
8662 SDValue &Chain) const {
8663 MachineFunction &MF = DAG.getMachineFunction();
8664 MachineFrameInfo &MFI = MF.getFrameInfo();
8665 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8666 auto PtrVT = getPointerTy(DAG.getDataLayout());
8667 Function &F = MF.getFunction();
8668 bool IsWin64 =
8669 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8670
8671 SmallVector<SDValue, 8> MemOps;
8672
8673 auto GPRArgRegs = AArch64::getGPRArgRegs();
8674 unsigned NumGPRArgRegs = GPRArgRegs.size();
8675 if (Subtarget->isWindowsArm64EC()) {
8676 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8677 // functions.
8678 NumGPRArgRegs = 4;
8679 }
8680 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8681
8682 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8683 int GPRIdx = 0;
8684 if (GPRSaveSize != 0) {
8685 if (IsWin64) {
8686 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8687 if (GPRSaveSize & 15)
8688 // The extra size here, if triggered, will always be 8.
8689 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8690 } else
8691 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8692
8693 SDValue FIN;
8694 if (Subtarget->isWindowsArm64EC()) {
8695 // With the Arm64EC ABI, we reserve the save area as usual, but we
8696 // compute its address relative to x4. For a normal AArch64->AArch64
8697 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8698 // different address.
8699 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8700 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8701 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8702 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8703 } else {
8704 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8705 }
8706
8707 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8708 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8709 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8710 SDValue Store =
8711 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8712 IsWin64 ? MachinePointerInfo::getFixedStack(
8713 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8714 : MachinePointerInfo::getStack(MF, i * 8));
8715 MemOps.push_back(Store);
8716 FIN =
8717 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8718 }
8719 }
8720 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8721 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8722
8723 if (Subtarget->hasFPARMv8() && !IsWin64) {
8724 auto FPRArgRegs = AArch64::getFPRArgRegs();
8725 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8726 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8727
8728 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8729 int FPRIdx = 0;
8730 if (FPRSaveSize != 0) {
8731 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8732
8733 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8734
8735 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8736 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8737 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8738
8739 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8740 MachinePointerInfo::getStack(MF, i * 16));
8741 MemOps.push_back(Store);
8742 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8743 DAG.getConstant(16, DL, PtrVT));
8744 }
8745 }
8746 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8747 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8748 }
8749
8750 if (!MemOps.empty()) {
8751 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8752 }
8753}
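// Worked example for the register save areas above (illustrative): on an
// AAPCS (non-Darwin, non-Win64) target with FP/SIMD, a variadic callee whose
// fixed arguments use x0-x2 and q0 has FirstVariadicGPR = 3 and
// FirstVariadicFPR = 1, so it saves GPRSaveSize = 8 * (8 - 3) = 40 bytes of
// GPRs and FPRSaveSize = 16 * (8 - 1) = 112 bytes of FPRs for later va_arg
// accesses.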
8754
8755/// LowerCallResult - Lower the result values of a call into the
8756/// appropriate copies out of appropriate physical registers.
8757SDValue AArch64TargetLowering::LowerCallResult(
8758 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8759 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8760 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8761 SDValue ThisVal, bool RequiresSMChange) const {
8762 DenseMap<unsigned, SDValue> CopiedRegs;
8763 // Copy all of the result registers out of their specified physreg.
8764 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8765 CCValAssign VA = RVLocs[i];
8766
8767 // Pass 'this' value directly from the argument to return value, to avoid
8768 // reg unit interference
8769 if (i == 0 && isThisReturn) {
8770 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8771 "unexpected return calling convention register assignment");
8772 InVals.push_back(ThisVal);
8773 continue;
8774 }
8775
8776 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8777 // allows one use of a physreg per block.
8778 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8779 if (!Val) {
8780 Val =
8781 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8782 Chain = Val.getValue(1);
8783 InGlue = Val.getValue(2);
8784 CopiedRegs[VA.getLocReg()] = Val;
8785 }
8786
8787 switch (VA.getLocInfo()) {
8788 default:
8789 llvm_unreachable("Unknown loc info!");
8790 case CCValAssign::Full:
8791 break;
8792 case CCValAssign::BCvt:
8793 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8794 break;
8795 case CCValAssign::AExtUpper:
8796 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8797 DAG.getConstant(32, DL, VA.getLocVT()));
8798 [[fallthrough]];
8799 case CCValAssign::AExt:
8800 [[fallthrough]];
8801 case CCValAssign::ZExt:
8802 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8803 break;
8804 }
8805
8806 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8807 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8808 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8809
8810 InVals.push_back(Val);
8811 }
8812
8813 return Chain;
8814}
8815
8816/// Return true if the calling convention is one that we can guarantee TCO for.
8817static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8818 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8819 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8820}
8821
8822/// Return true if we might ever do TCO for calls with this calling convention.
8823 static bool mayTailCallThisCC(CallingConv::ID CC) {
8824 switch (CC) {
8825 case CallingConv::C:
8826 case CallingConv::AArch64_SVE_VectorCall:
8827 case CallingConv::PreserveMost:
8828 case CallingConv::PreserveAll:
8829 case CallingConv::PreserveNone:
8830 case CallingConv::Swift:
8831 case CallingConv::SwiftTail:
8832 case CallingConv::Tail:
8833 case CallingConv::Fast:
8834 return true;
8835 default:
8836 return false;
8837 }
8838}
8839
8840/// Return true if the call convention supports varargs
8841/// Currently only those that pass varargs like the C
8842/// calling convention does are eligible
8843/// Calling conventions listed in this function must also
8844/// be properly handled in AArch64Subtarget::isCallingConvWin64
8845 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8846 switch (CC) {
8847 case CallingConv::C:
8849 // SVE vector call is only partially supported, but it should
8850 // support named arguments being passed. Any arguments being passed
8851 // as varargs are still unsupported.
8852 case CallingConv::AArch64_SVE_VectorCall:
8853 return true;
8854 default:
8855 return false;
8856 }
8857}
8858
8859 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8860 const AArch64Subtarget *Subtarget,
8861 const TargetLowering::CallLoweringInfo &CLI,
8862 CCState &CCInfo) {
8863 const SelectionDAG &DAG = CLI.DAG;
8864 CallingConv::ID CalleeCC = CLI.CallConv;
8865 bool IsVarArg = CLI.IsVarArg;
8866 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8867 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8868
8869 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8870 // for the shadow store.
8871 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8872 CCInfo.AllocateStack(32, Align(16));
8873
8874 unsigned NumArgs = Outs.size();
8875 for (unsigned i = 0; i != NumArgs; ++i) {
8876 MVT ArgVT = Outs[i].VT;
8877 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8878
8879 bool UseVarArgCC = false;
8880 if (IsVarArg) {
8881 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8882 // too, so use the vararg CC to force them to integer registers.
8883 if (IsCalleeWin64) {
8884 UseVarArgCC = true;
8885 } else {
8886 UseVarArgCC = ArgFlags.isVarArg();
8887 }
8888 }
8889
8890 if (!UseVarArgCC) {
8891 // Get type of the original argument.
8892 EVT ActualVT =
8893 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8894 /*AllowUnknown*/ true);
8895 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8896 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8897 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8898 ArgVT = MVT::i8;
8899 else if (ActualMVT == MVT::i16)
8900 ArgVT = MVT::i16;
8901 }
8902
8903 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8904 // argument. This logic should exactly mirror LowerFormalArguments.
8905 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8906 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8907 Outs[i].OrigTy, CCInfo);
8908 assert(!Res && "Call operand has unhandled type");
8909 (void)Res;
8910 }
8911}
8912
8913static SMECallAttrs
8914 getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
8915 const TargetLowering::CallLoweringInfo &CLI) {
8916 if (CLI.CB)
8917 return SMECallAttrs(*CLI.CB, &TLI);
8918 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8919 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
8921}
8922
8923bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8924 const CallLoweringInfo &CLI) const {
8925 CallingConv::ID CalleeCC = CLI.CallConv;
8926 if (!mayTailCallThisCC(CalleeCC))
8927 return false;
8928
8929 SDValue Callee = CLI.Callee;
8930 bool IsVarArg = CLI.IsVarArg;
8931 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8932 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8933 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8934 const SelectionDAG &DAG = CLI.DAG;
8935 MachineFunction &MF = DAG.getMachineFunction();
8936 const Function &CallerF = MF.getFunction();
8937 CallingConv::ID CallerCC = CallerF.getCallingConv();
8938
8939 // SME Streaming functions are not eligible for TCO as they may require
8940 // the streaming mode or ZA to be restored after returning from the call.
8941 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
8942 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8943 CallAttrs.requiresPreservingAllZAState() ||
8944 CallAttrs.caller().hasStreamingBody())
8945 return false;
8946
8947 // Functions using the C or Fast calling convention that have an SVE signature
8948 // preserve more registers and should assume the SVE_VectorCall CC.
8949 // The check for matching callee-saved regs will determine whether it is
8950 // eligible for TCO.
8951 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8952 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8953 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8954
8955 bool CCMatch = CallerCC == CalleeCC;
8956
8957 // When using the Windows calling convention on a non-windows OS, we want
8958 // to back up and restore X18 in such functions; we can't do a tail call
8959 // from those functions.
8960 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8961 CalleeCC != CallingConv::Win64)
8962 return false;
8963
8964 // Byval parameters hand the function a pointer directly into the stack area
8965 // we want to reuse during a tail call. Working around this *is* possible (see
8966 // X86) but less efficient and uglier in LowerCall.
8967 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8968 e = CallerF.arg_end();
8969 i != e; ++i) {
8970 if (i->hasByValAttr())
8971 return false;
8972
8973 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8974 // In this case, it is necessary to save X0/X1 in the callee and return it
8975 // in X0. Tail call opt may interfere with this, so we disable tail call
8976 // opt when the caller has an "inreg" attribute -- except if the callee
8977 // also has that attribute on the same argument, and the same value is
8978 // passed.
8979 if (i->hasInRegAttr()) {
8980 unsigned ArgIdx = i - CallerF.arg_begin();
8981 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8982 return false;
8983 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
8984 if (!Attrs.hasAttribute(Attribute::InReg) ||
8985 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
8986 CLI.CB->getArgOperand(ArgIdx) != i) {
8987 return false;
8988 }
8989 }
8990 }
8991
8992 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8993 return CCMatch;
8994
8995 // Externally-defined functions with weak linkage should not be
8996 // tail-called on AArch64 when the OS does not support dynamic
8997 // pre-emption of symbols, as the AAELF spec requires normal calls
8998 // to undefined weak functions to be replaced with a NOP or jump to the
8999 // next instruction. The behaviour of branch instructions in this
9000 // situation (as used for tail calls) is implementation-defined, so we
9001 // cannot rely on the linker replacing the tail call with a return.
9002 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9003 const GlobalValue *GV = G->getGlobal();
9004 const Triple &TT = getTargetMachine().getTargetTriple();
9005 if (GV->hasExternalWeakLinkage() &&
9006 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9007 return false;
9008 }
9009
9010 // Now we search for cases where we can use a tail call without changing the
9011 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9012 // concept.
9013
9014 // I want anyone implementing a new calling convention to think long and hard
9015 // about this assert.
9016 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9017 report_fatal_error("Unsupported variadic calling convention");
9018
9019 LLVMContext &C = *DAG.getContext();
9020 // Check that the call results are passed in the same way.
9021 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9022 CCAssignFnForCall(CalleeCC, IsVarArg),
9023 CCAssignFnForCall(CallerCC, IsVarArg)))
9024 return false;
9025 // The callee has to preserve all registers the caller needs to preserve.
9026 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9027 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9028 if (!CCMatch) {
9029 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9030 if (Subtarget->hasCustomCallingConv()) {
9031 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9032 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9033 }
9034 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9035 return false;
9036 }
9037
9038 // Nothing more to check if the callee is taking no arguments
9039 if (Outs.empty())
9040 return true;
9041
9042 SmallVector<CCValAssign, 16> ArgLocs;
9043 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9044
9045 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9046
9047 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9048 // When the call is musttail, additional checks have already been done, so this check can safely be skipped.
9049 // At least two cases here: if caller is fastcc then we can't have any
9050 // memory arguments (we'd be expected to clean up the stack afterwards). If
9051 // caller is C then we could potentially use its argument area.
9052
9053 // FIXME: for now we take the most conservative of these in both cases:
9054 // disallow all variadic memory operands.
9055 for (const CCValAssign &ArgLoc : ArgLocs)
9056 if (!ArgLoc.isRegLoc())
9057 return false;
9058 }
9059
9060 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9061
9062 // If any of the arguments is passed indirectly, it must be SVE, so the
9063 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9064 // allocate space on the stack. That is why we explicitly determine here that
9065 // such a call cannot be a tail call.
9066 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9067 assert((A.getLocInfo() != CCValAssign::Indirect ||
9068 A.getValVT().isScalableVector() ||
9069 Subtarget->isWindowsArm64EC()) &&
9070 "Expected value to be scalable");
9071 return A.getLocInfo() == CCValAssign::Indirect;
9072 }))
9073 return false;
9074
9075 // If the stack arguments for this call do not fit into our own save area then
9076 // the call cannot be made tail.
9077 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9078 return false;
9079
9080 const MachineRegisterInfo &MRI = MF.getRegInfo();
9081 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9082 return false;
9083
9084 return true;
9085}
9086
9087SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9088 SelectionDAG &DAG,
9089 MachineFrameInfo &MFI,
9090 int ClobberedFI) const {
9091 SmallVector<SDValue, 8> ArgChains;
9092 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9093 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9094
9095 // Include the original chain at the beginning of the list. When this is
9096 // used by target LowerCall hooks, this helps legalize find the
9097 // CALLSEQ_BEGIN node.
9098 ArgChains.push_back(Chain);
9099
9100 // Add a chain value for each stack argument corresponding
9101 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9102 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9103 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9104 if (FI->getIndex() < 0) {
9105 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9106 int64_t InLastByte = InFirstByte;
9107 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9108
9109 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9110 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9111 ArgChains.push_back(SDValue(L, 1));
9112 }
9113
9114 // Build a tokenfactor for all the chains.
9115 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9116}
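// Worked example for the overlap test above (illustrative): if the clobbered
// slot covers bytes [16, 23] and an incoming stack-argument load covers bytes
// [20, 27], then FirstByte(16) <= InFirstByte(20) <= LastByte(23), so the
// load's chain is added to the TokenFactor and the load is forced to happen
// before the slot is overwritten.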
9117
9118bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9119 bool TailCallOpt) const {
9120 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9121 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9122}
9123
9124// Check if the value is zero-extended from i1 to i8
9125static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9126 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9127 if (SizeInBits < 8)
9128 return false;
9129
9130 APInt RequiredZero(SizeInBits, 0xFE);
9131 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9132 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9133 return ZExtBool;
9134}
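// Worked example for checkZExtBool (illustrative): for a value produced by
// zero-extending an i1, computeKnownBits reports bits 1-7 (mask 0xFE) as
// known zero, so (Bits.Zero & RequiredZero) == RequiredZero and LowerCall can
// skip emitting the extra truncate + zero-extend pair for an AAPCS i1
// argument.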
9135
9136void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9137 SDNode *Node) const {
9138 // Live-in physreg copies that are glued to SMSTART are applied as
9139 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9140 // register allocator to pass call args in callee saved regs, without extra
9141 // copies to avoid these fake clobbers of actually-preserved GPRs.
9142 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9143 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9144 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9145 if (MachineOperand &MO = MI.getOperand(I);
9146 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9147 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9148 AArch64::GPR64RegClass.contains(MO.getReg())))
9149 MI.removeOperand(I);
9150
9151 // The SVE vector length can change when entering/leaving streaming mode.
9152 // FPMR is set to 0 when entering/leaving streaming mode.
9153 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9154 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9155 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9156 /*IsImplicit=*/true));
9157 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9158 /*IsImplicit=*/true));
9159 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9160 /*IsImplicit=*/true));
9161 }
9162 }
9163
9164 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9165 // have nothing to do with VG, were it not that they are used to materialise a
9166 // frame-address. If they contain a frame-index to a scalable vector, this
9167 // will likely require an ADDVL instruction to materialise the address, thus
9168 // reading VG.
9169 const MachineFunction &MF = *MI.getMF();
9170 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9171 (MI.getOpcode() == AArch64::ADDXri ||
9172 MI.getOpcode() == AArch64::SUBXri)) {
9173 const MachineOperand &MO = MI.getOperand(1);
9174 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
9175 TargetStackID::ScalableVector)
9176 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9177 /*IsImplicit=*/true));
9178 }
9179}
9180
9181 SDValue AArch64TargetLowering::changeStreamingMode(
9182 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9183 unsigned Condition, bool InsertVectorLengthCheck) const {
9184 MachineFunction &MF = DAG.getMachineFunction();
9185 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9186 FuncInfo->setHasStreamingModeChanges(true);
9187
9188 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9189 SmallVector<SDValue, 2> Ops = {Chain};
9190 if (InGlue)
9191 Ops.push_back(InGlue);
9192 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9193 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9194 };
9195
9196 if (InsertVectorLengthCheck && Enable) {
9197 // Non-streaming -> Streaming
9198 // Insert vector length check before smstart
9199 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9200 Chain = CheckVL.getValue(0);
9201 InGlue = CheckVL.getValue(1);
9202 }
9203
9204 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9205 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9206 SDValue MSROp =
9207 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9208 SmallVector<SDValue> Ops = {Chain, MSROp};
9209 unsigned Opcode;
9210 if (Condition != AArch64SME::Always) {
9211 Register PStateReg = FuncInfo->getPStateSMReg();
9212 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9213 SDValue PStateSM =
9214 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9215 // Use chain and glue from the CopyFromReg.
9216 Ops[0] = PStateSM.getValue(1);
9217 InGlue = PStateSM.getValue(2);
9218 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9219 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9220 Ops.push_back(ConditionOp);
9221 Ops.push_back(PStateSM);
9222 } else {
9223 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9224 }
9225 Ops.push_back(RegMask);
9226
9227 if (InGlue)
9228 Ops.push_back(InGlue);
9229
9230 SDValue SMChange =
9231 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9232
9233 if (!InsertVectorLengthCheck || Enable)
9234 return SMChange;
9235
9236 // Streaming -> Non-streaming
9237 // Insert vector length check after smstop since we cannot read VL
9238 // in streaming mode
9239 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9240}
9241
9242// Emit a call to __arm_sme_save or __arm_sme_restore.
9243 static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
9244 SelectionDAG &DAG,
9245 AArch64FunctionInfo *Info, SDLoc DL,
9246 SDValue Chain, bool IsSave) {
9247 MachineFunction &MF = DAG.getMachineFunction();
9248 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9249 FuncInfo->setSMESaveBufferUsed();
9250 TargetLowering::ArgListTy Args;
9251 Args.emplace_back(
9252 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
9254
9255 RTLIB::Libcall LC =
9256 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
9257 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
9258 TLI.getPointerTy(DAG.getDataLayout()));
9259 auto *RetTy = Type::getVoidTy(*DAG.getContext());
9260 TargetLowering::CallLoweringInfo CLI(DAG);
9261 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9262 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
9263 return TLI.LowerCallTo(CLI).second;
9264}
9265
9268 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9269 CallAttrs.caller().hasStreamingBody())
9270 return AArch64SME::Always;
9271 if (CallAttrs.callee().hasNonStreamingInterface())
9272 return AArch64SME::IfCallerIsStreaming;
9273 if (CallAttrs.callee().hasStreamingInterface())
9274 return AArch64SME::IfCallerIsNonStreaming;
9275
9276 llvm_unreachable("Unsupported attributes");
9277}
9278
9279/// Check whether a stack argument requires lowering in a tail call.
9280 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9281 const CCValAssign &VA, SDValue Arg,
9282 ISD::ArgFlagsTy Flags, int CallOffset) {
9283 // FIXME: We should be able to handle this case, but it's not clear how to.
9284 if (Flags.isZExt() || Flags.isSExt())
9285 return true;
9286
9287 for (;;) {
9288 // Look through nodes that don't alter the bits of the incoming value.
9289 unsigned Op = Arg.getOpcode();
9290 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9291 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9292 Arg = Arg.getOperand(0);
9293 continue;
9294 }
9295 break;
9296 }
9297
9298 // If the argument is a load from the same immutable stack slot, we can reuse
9299 // it.
9300 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9301 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9302 const MachineFrameInfo &MFI = MF.getFrameInfo();
9303 int FI = FINode->getIndex();
9304 if (!MFI.isImmutableObjectIndex(FI))
9305 return true;
9306 if (CallOffset != MFI.getObjectOffset(FI))
9307 return true;
9308 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9309 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9310 return true;
9311 return false;
9312 }
9313 }
9314
9315 return true;
9316}
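// Illustrative example for the check above (not from this file): when the
// outgoing argument is just a load of the caller's own incoming stack
// argument from the same immutable fixed-stack slot, with matching offset and
// size, the bytes are already in place for the tail call and no store is
// needed; any mismatch (different offset, partial size, or a mutable slot)
// returns true so LowerCall re-stores the value.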
9317
9318/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9319/// and add input and output parameter nodes.
9320SDValue
9321AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9322 SmallVectorImpl<SDValue> &InVals) const {
9323 SelectionDAG &DAG = CLI.DAG;
9324 SDLoc &DL = CLI.DL;
9325 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9326 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9327 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9328 SDValue Chain = CLI.Chain;
9329 SDValue Callee = CLI.Callee;
9330 bool &IsTailCall = CLI.IsTailCall;
9331 CallingConv::ID &CallConv = CLI.CallConv;
9332 bool IsVarArg = CLI.IsVarArg;
9333 const CallBase *CB = CLI.CB;
9334
9335 MachineFunction &MF = DAG.getMachineFunction();
9336 MachineFunction::CallSiteInfo CSInfo;
9337 bool IsThisReturn = false;
9338
9339 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9340 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9341 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9342 bool IsSibCall = false;
9343 bool GuardWithBTI = false;
9344
9345 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9346 !Subtarget->noBTIAtReturnTwice()) {
9347 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9348 }
9349
9350 // Analyze operands of the call, assigning locations to each operand.
9351 SmallVector<CCValAssign, 16> ArgLocs;
9352 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9353
9354 if (IsVarArg) {
9355 unsigned NumArgs = Outs.size();
9356
9357 for (unsigned i = 0; i != NumArgs; ++i) {
9358 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9359 report_fatal_error("Passing SVE types to variadic functions is "
9360 "currently not supported");
9361 }
9362 }
9363
9364 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9365
9366 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9367 // Assign locations to each value returned by this call.
9368 SmallVector<CCValAssign, 16> RVLocs;
9369 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9370 *DAG.getContext());
9371 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9372
9373 // Set type id for call site info.
9374 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9375 CSInfo = MachineFunction::CallSiteInfo(*CB);
9376
9377 // Check callee args/returns for SVE registers and set calling convention
9378 // accordingly.
9379 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9380 auto HasSVERegLoc = [](CCValAssign &Loc) {
9381 if (!Loc.isRegLoc())
9382 return false;
9383 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9384 AArch64::PPRRegClass.contains(Loc.getLocReg());
9385 };
9386 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9387 CallConv = CallingConv::AArch64_SVE_VectorCall;
9388 }
9389
9390 // Determine whether we need any streaming mode changes.
9391 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9392
9393 std::optional<unsigned> ZAMarkerNode;
9394 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9395
9396 if (UseNewSMEABILowering) {
9397 if (CallAttrs.requiresLazySave() ||
9398 CallAttrs.requiresPreservingAllZAState())
9399 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9400 else if (CallAttrs.caller().hasZAState() ||
9401 CallAttrs.caller().hasZT0State())
9402 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9403 }
9404
9405 if (IsTailCall) {
9406 // Check if it's really possible to do a tail call.
9407 IsTailCall = isEligibleForTailCallOptimization(CLI);
9408
9409 // A sibling call is one where we're under the usual C ABI and not planning
9410 // to change that but can still do a tail call:
9411 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9412 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9413 IsSibCall = true;
9414
9415 if (IsTailCall)
9416 ++NumTailCalls;
9417 }
9418
9419 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9420 report_fatal_error("failed to perform tail call elimination on a call "
9421 "site marked musttail");
9422
9423 // Get a count of how many bytes are to be pushed on the stack.
9424 unsigned NumBytes = CCInfo.getStackSize();
9425
9426 if (IsSibCall) {
9427 // Since we're not changing the ABI to make this a tail call, the memory
9428 // operands are already available in the caller's incoming argument space.
9429 NumBytes = 0;
9430 }
9431
9432 // FPDiff is the byte offset of the call's argument area from the callee's.
9433 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9434 // by this amount for a tail call. In a sibling call it must be 0 because the
9435 // caller will deallocate the entire stack and the callee still expects its
9436 // arguments to begin at SP+0. Completely unused for non-tail calls.
9437 int FPDiff = 0;
9438
9439 if (IsTailCall && !IsSibCall) {
9440 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9441
9442 // Since callee will pop argument stack as a tail call, we must keep the
9443 // popped size 16-byte aligned.
9444 NumBytes = alignTo(NumBytes, 16);
9445
9446 // FPDiff will be negative if this tail call requires more space than we
9447 // would automatically have in our incoming argument space. Positive if we
9448 // can actually shrink the stack.
9449 FPDiff = NumReusableBytes - NumBytes;
9450
9451 // Update the required reserved area if this is the tail call requiring the
9452 // most argument stack space.
9453 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9454 FuncInfo->setTailCallReservedStack(-FPDiff);
9455
9456 // The stack pointer must be 16-byte aligned at all times it's used for a
9457 // memory operation, which in practice means at *all* times and in
9458 // particular across call boundaries. Therefore our own arguments started at
9459 // a 16-byte aligned SP and the delta applied for the tail call should
9460 // satisfy the same constraint.
9461 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9462 }
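// Worked example for FPDiff (illustrative): if the caller's own incoming
// argument area holds NumReusableBytes = 16 bytes but this tail call needs
// NumBytes = 32 bytes, then FPDiff = 16 - 32 = -16, the callee's stack
// arguments are written 16 bytes below the caller's argument area, and
// TailCallReservedStack is raised to 16 so the prologue reserves that space.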
9463
9464 auto DescribeCallsite =
9465 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9466 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9467 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9468 R << ore::NV("Callee", ES->getSymbol());
9469 else if (CLI.CB && CLI.CB->getCalledFunction())
9470 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9471 else
9472 R << "unknown callee";
9473 R << "'";
9474 return R;
9475 };
9476
9477 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9478 bool RequiresSaveAllZA =
9479 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9480 if (RequiresLazySave) {
9481 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9482 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9483 TPIDR2.FrameIndex,
9485 Chain = DAG.getNode(
9486 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9487 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9488 TPIDR2ObjAddr);
9489 OptimizationRemarkEmitter ORE(&MF.getFunction());
9490 ORE.emit([&]() {
9491 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9492 CLI.CB)
9493 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9494 &MF.getFunction());
9495 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9496 });
9497 } else if (RequiresSaveAllZA) {
9498 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9499 "Cannot share state that may not exist");
9500 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9501 /*IsSave=*/true);
9502 }
9503
9504 bool RequiresSMChange = CallAttrs.requiresSMChange();
9505 if (RequiresSMChange) {
9506 OptimizationRemarkEmitter ORE(&MF.getFunction());
9507 ORE.emit([&]() {
9508 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9509 CLI.CB)
9510 : OptimizationRemarkAnalysis("sme", "SMETransition",
9511 &MF.getFunction());
9512 DescribeCallsite(R) << " requires a streaming mode transition";
9513 return R;
9514 });
9515 }
9516
9517 SDValue ZTFrameIdx;
9518 MachineFrameInfo &MFI = MF.getFrameInfo();
9519 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9520
9521 // If the caller has ZT0 state which will not be preserved by the callee,
9522 // spill ZT0 before the call.
9523 if (ShouldPreserveZT0) {
9524 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9525
9526 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9527 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9528 }
9529
9530 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9531 // PSTATE.ZA before the call if there is no lazy-save active.
9532 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9533 assert((!DisableZA || !RequiresLazySave) &&
9534 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9535
9536 if (DisableZA)
9537 Chain = DAG.getNode(
9538 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9539 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9540
9541 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9542 // These operations are automatically eliminated by the prolog/epilog pass
9543 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9544 if (!IsSibCall) {
9545 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9546 if (ZAMarkerNode) {
9547 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9548 // using a chain can result in incorrect scheduling. The markers refer to
9549 // the position just before the CALLSEQ_START (though occur after as
9550 // CALLSEQ_START lacks in-glue).
9551 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9552 {Chain, Chain.getValue(1)});
9553 }
9554 }
9555
9556 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9557 getPointerTy(DAG.getDataLayout()));
9558
9559 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9560 SmallSet<unsigned, 8> RegsUsed;
9561 SmallVector<SDValue, 8> MemOpChains;
9562 auto PtrVT = getPointerTy(DAG.getDataLayout());
9563
9564 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9565 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9566 for (const auto &F : Forwards) {
9567 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9568 RegsToPass.emplace_back(F.PReg, Val);
9569 }
9570 }
9571
9572 // Walk the register/memloc assignments, inserting copies/loads.
9573 unsigned ExtraArgLocs = 0;
9574 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9575 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9576 SDValue Arg = OutVals[i];
9577 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9578
9579 // Promote the value if needed.
9580 switch (VA.getLocInfo()) {
9581 default:
9582 llvm_unreachable("Unknown loc info!");
9583 case CCValAssign::Full:
9584 break;
9585 case CCValAssign::SExt:
9586 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9587 break;
9588 case CCValAssign::ZExt:
9589 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9590 break;
9591 case CCValAssign::AExt:
9592 if (Outs[i].ArgVT == MVT::i1) {
9593 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9594 //
9595 // Check if we actually have to do this, because the value may
9596 // already be zero-extended.
9597 //
9598 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9599 // and rely on DAGCombiner to fold this, because the following
9600 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9601 //
9602 // (ext (zext x)) -> (zext x)
9603 //
9604 // This will give us (zext i32), which we cannot remove, so
9605 // try to check this beforehand.
9606 if (!checkZExtBool(Arg, DAG)) {
9607 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9608 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9609 }
9610 }
9611 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9612 break;
9613 case CCValAssign::AExtUpper:
9614 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9615 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9616 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9617 DAG.getConstant(32, DL, VA.getLocVT()));
9618 break;
9619 case CCValAssign::BCvt:
9620 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9621 break;
9622 case CCValAssign::Trunc:
9623 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9624 break;
9625 case CCValAssign::FPExt:
9626 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9627 break;
9628 case CCValAssign::Indirect: {
9629 bool isScalable = VA.getValVT().isScalableVT();
9630 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9631 "Indirect arguments should be scalable on most subtargets");
9632
9633 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9634 uint64_t PartSize = StoreSize;
9635 unsigned NumParts = 1;
9636 if (Outs[i].Flags.isInConsecutiveRegs()) {
9637 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9638 ++NumParts;
9639 StoreSize *= NumParts;
9640 }
9641
9642 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9643 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9644 MachineFrameInfo &MFI = MF.getFrameInfo();
9645 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9646 if (isScalable)
9647 MFI.setStackID(FI, TargetStackID::ScalableVector);
9648
9649 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9650 SDValue Ptr = DAG.getFrameIndex(
9651 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9652 SDValue SpillSlot = Ptr;
9653
9654 // Ensure we generate all stores for each tuple part, whilst updating the
9655 // pointer after each store correctly using vscale.
9656 while (NumParts) {
9657 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9658 MemOpChains.push_back(Store);
9659
9660 NumParts--;
9661 if (NumParts > 0) {
9662 SDValue BytesIncrement;
9663 if (isScalable) {
9664 BytesIncrement = DAG.getVScale(
9665 DL, Ptr.getValueType(),
9666 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9667 } else {
9668 BytesIncrement = DAG.getConstant(
9669 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9670 Ptr.getValueType());
9671 }
9672 MPI = MachinePointerInfo(MPI.getAddrSpace());
9673 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9674 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9675 ExtraArgLocs++;
9676 i++;
9677 }
9678 }
9679
9680 Arg = SpillSlot;
9681 break;
9682 }
9683
9684 if (VA.isRegLoc()) {
9685 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9686 Outs[0].VT == MVT::i64) {
9687 assert(VA.getLocVT() == MVT::i64 &&
9688 "unexpected calling convention register assignment");
9689 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9690 "unexpected use of 'returned'");
9691 IsThisReturn = true;
9692 }
9693 if (RegsUsed.count(VA.getLocReg())) {
9694 // If this register has already been used then we're trying to pack
9695 // parts of an [N x i32] into an X-register. The extension type will
9696 // take care of putting the two halves in the right place but we have to
9697 // combine them.
9698 SDValue &Bits =
9699 llvm::find_if(RegsToPass,
9700 [=](const std::pair<unsigned, SDValue> &Elt) {
9701 return Elt.first == VA.getLocReg();
9702 })
9703 ->second;
9704 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9705 // Call site info is used for function's parameter entry value
9706 // tracking. For now we track only simple cases when parameter
9707 // is transferred through whole register.
9708 llvm::erase_if(CSInfo.ArgRegPairs,
9709 [&VA](MachineFunction::ArgRegPair ArgReg) {
9710 return ArgReg.Reg == VA.getLocReg();
9711 });
9712 } else {
9713 // Add an extra level of indirection for streaming mode changes by
9714 // using a pseudo copy node that cannot be rematerialised between a
9715 // smstart/smstop and the call by the simple register coalescer.
9716 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9717 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9718 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9719 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9720 RegsUsed.insert(VA.getLocReg());
9721 const TargetOptions &Options = DAG.getTarget().Options;
9722 if (Options.EmitCallSiteInfo)
9723 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9724 }
9725 } else {
9726 assert(VA.isMemLoc());
9727
9728 SDValue DstAddr;
9729 MachinePointerInfo DstInfo;
9730
9731 // FIXME: This works on big-endian for composite byvals, which are the
9732 // common case. It should also work for fundamental types.
9733 uint32_t BEAlign = 0;
9734 unsigned OpSize;
9735 if (VA.getLocInfo() == CCValAssign::Indirect ||
9737 OpSize = VA.getLocVT().getFixedSizeInBits();
9738 else
9739 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9740 : VA.getValVT().getSizeInBits();
9741 OpSize = (OpSize + 7) / 8;
9742 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9743 !Flags.isInConsecutiveRegs()) {
9744 if (OpSize < 8)
9745 BEAlign = 8 - OpSize;
9746 }
9747 unsigned LocMemOffset = VA.getLocMemOffset();
9748 int32_t Offset = LocMemOffset + BEAlign;
9749
9750 if (IsTailCall) {
9751 // When the frame pointer is perfectly aligned for the tail call and the
9752 // same stack argument is passed down intact, we can reuse it.
9753 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9754 continue;
9755
9756 Offset = Offset + FPDiff;
9757 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9758
9759 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9760 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9761
9762 // Make sure any stack arguments overlapping with where we're storing
9763 // are loaded before this eventual operation. Otherwise they'll be
9764 // clobbered.
9765 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9766 } else {
9767 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9768
9769 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9770 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9771 }
9772
9773 if (Outs[i].Flags.isByVal()) {
9774 SDValue SizeNode =
9775 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9776 SDValue Cpy = DAG.getMemcpy(
9777 Chain, DL, DstAddr, Arg, SizeNode,
9778 Outs[i].Flags.getNonZeroByValAlign(),
9779 /*isVol = */ false, /*AlwaysInline = */ false,
9780 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9781
9782 MemOpChains.push_back(Cpy);
9783 } else {
9784 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9785 // promoted to a legal register type i32, we should truncate Arg back to
9786 // i1/i8/i16.
9787 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9788 VA.getValVT() == MVT::i16)
9789 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9790
9791 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9792 MemOpChains.push_back(Store);
9793 }
9794 }
9795 }
9796
9797 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9798 !(CLI.CB && CLI.CB->isMustTailCall())) {
9799 SDValue ParamPtr = StackPtr;
9800 if (IsTailCall) {
9801 // Create a dummy object at the top of the stack that can be used to get
9802 // the SP after the epilogue
9803 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9804 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9805 }
9806
9807 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9808 // describing the argument list. x4 contains the address of the
9809 // first stack parameter. x5 contains the size in bytes of all parameters
9810 // passed on the stack.
9811 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9812 RegsToPass.emplace_back(AArch64::X5,
9813 DAG.getConstant(NumBytes, DL, MVT::i64));
9814 }
9815
9816 if (!MemOpChains.empty())
9817 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9818
9819 SDValue InGlue;
9820 if (RequiresSMChange) {
9821 bool InsertVectorLengthCheck =
9823 Chain = changeStreamingMode(
9824 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9825 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9826 InGlue = Chain.getValue(1);
9827 }
9828
9829 // Build a sequence of copy-to-reg nodes chained together with token chain
9830 // and flag operands which copy the outgoing args into the appropriate regs.
9831 for (auto &RegToPass : RegsToPass) {
9832 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9833 RegToPass.second, InGlue);
9834 InGlue = Chain.getValue(1);
9835 }
9836
9837 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9838 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9839 // node so that legalize doesn't hack it.
9840 const GlobalValue *CalledGlobal = nullptr;
9841 unsigned OpFlags = 0;
9842 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9843 CalledGlobal = G->getGlobal();
9844 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9845 getTargetMachine());
9846 if (OpFlags & AArch64II::MO_GOT) {
9847 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9848 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9849 } else {
9850 const GlobalValue *GV = G->getGlobal();
9851 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9852 }
9853 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9854 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9855 Subtarget->isTargetMachO()) ||
9856 MF.getFunction().getParent()->getRtLibUseGOT();
9857 const char *Sym = S->getSymbol();
9858 if (UseGot) {
9859 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9860 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9861 } else {
9862 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9863 }
9864 }
9865
9866 // We don't usually want to end the call-sequence here because we would tidy
9867 // the frame up *after* the call, however in the ABI-changing tail-call case
9868 // we've carefully laid out the parameters so that when sp is reset they'll be
9869 // in the correct location.
9870 if (IsTailCall && !IsSibCall) {
9871 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9872 InGlue = Chain.getValue(1);
9873 }
9874
9875 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9876
9877 std::vector<SDValue> Ops;
9878 Ops.push_back(Chain);
9879 Ops.push_back(Callee);
9880
9881 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9882 // be expanded to the call, directly followed by a special marker sequence and
9883 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
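// A minimal illustrative IR call site carrying that bundle (the callee name
// is a placeholder):
//   %call = call ptr @foo() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]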
9884 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9885 assert(!IsTailCall &&
9886 "tail calls cannot be marked with clang.arc.attachedcall");
9887 Opc = AArch64ISD::CALL_RVMARKER;
9888
9889 // Add a target global address for the retainRV/claimRV runtime function
9890 // just before the call target.
9891 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9892 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9893 Ops.insert(Ops.begin() + 1, GA);
9894
9895 // We may or may not need to emit both the marker and the retain/claim call.
9896 // Tell the pseudo expansion using an additional boolean op.
9897 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9898 SDValue DoEmitMarker =
9899 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9900 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9901 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9902 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9903 } else if (GuardWithBTI) {
9904 Opc = AArch64ISD::CALL_BTI;
9905 }
9906
9907 if (IsTailCall) {
9908 // Each tail call may have to adjust the stack by a different amount, so
9909 // this information must travel along with the operation for eventual
9910 // consumption by emitEpilogue.
9911 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9912 }
9913
9914 if (CLI.PAI) {
9915 const uint64_t Key = CLI.PAI->Key;
9916 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9917 "Invalid auth call key");
9918
9919 // Split the discriminator into address/integer components.
9920 SDValue AddrDisc, IntDisc;
9921 std::tie(IntDisc, AddrDisc) =
9922 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9923
9924 if (Opc == AArch64ISD::CALL_RVMARKER)
9925 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9926 else
9927 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9928 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9929 Ops.push_back(IntDisc);
9930 Ops.push_back(AddrDisc);
9931 }
9932
9933 // Add argument registers to the end of the list so that they are known live
9934 // into the call.
9935 for (auto &RegToPass : RegsToPass)
9936 Ops.push_back(DAG.getRegister(RegToPass.first,
9937 RegToPass.second.getValueType()));
9938
9939 // Add a register mask operand representing the call-preserved registers.
9940 const uint32_t *Mask;
9941 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9942 if (IsThisReturn) {
9943 // For 'this' returns, use the X0-preserving mask if applicable
9944 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9945 if (!Mask) {
9946 IsThisReturn = false;
9947 Mask = TRI->getCallPreservedMask(MF, CallConv);
9948 }
9949 } else
9950 Mask = TRI->getCallPreservedMask(MF, CallConv);
9951
9952 if (Subtarget->hasCustomCallingConv())
9953 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9954
9955 if (TRI->isAnyArgRegReserved(MF))
9956 TRI->emitReservedArgRegCallError(MF);
9957
9958 assert(Mask && "Missing call preserved mask for calling convention");
9959 Ops.push_back(DAG.getRegisterMask(Mask));
9960
9961 if (InGlue.getNode())
9962 Ops.push_back(InGlue);
9963
9964 // If we're doing a tail call, use a TC_RETURN here rather than an
9965 // actual call instruction.
9966 if (IsTailCall) {
9967 MF.getFrameInfo().setHasTailCall();
9968 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9969 if (IsCFICall)
9970 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9971
9972 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9973 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9974 if (CalledGlobal &&
9975 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9976 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9977 return Ret;
9978 }
9979
9980 // Returns a chain and a flag for retval copy to use.
9981 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9982 if (IsCFICall)
9983 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9984
9985 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9986 InGlue = Chain.getValue(1);
9987 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9988 if (CalledGlobal &&
9989 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9990 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9991
9992 uint64_t CalleePopBytes =
9993 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9994
9995 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9996 InGlue = Chain.getValue(1);
9997
9998 // Handle result values, copying them out of physregs into vregs that we
9999 // return.
10000 SDValue Result = LowerCallResult(
10001 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10002 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10003
10004 if (!Ins.empty())
10005 InGlue = Result.getValue(Result->getNumValues() - 1);
10006
10007 if (RequiresSMChange) {
10008 Result = changeStreamingMode(
10009 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10010 getSMToggleCondition(CallAttrs));
10011 }
10012
10013 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10014 // Unconditionally resume ZA.
10015 Result = DAG.getNode(
10016 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10017 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10018
10019 if (ShouldPreserveZT0)
10020 Result =
10021 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10022 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10023
10024 if (RequiresLazySave) {
10025 // Conditionally restore the lazy save using a pseudo node.
10026 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
10027 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
10028 SDValue RegMask = DAG.getRegisterMask(
10029 TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
10030 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
10031 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
10032 SDValue TPIDR2_EL0 = DAG.getNode(
10033 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
10034 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
10035 // Copy the address of the TPIDR2 block into X0 before 'calling' the
10036 // RESTORE_ZA pseudo.
10037 SDValue Glue;
10038 SDValue TPIDR2Block = DAG.getFrameIndex(
10039 TPIDR2.FrameIndex,
10040 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
10041 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
10042 Result =
10043 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
10044 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
10045 RestoreRoutine, RegMask, Result.getValue(1)});
10046 // Finally reset the TPIDR2_EL0 register to 0.
10047 Result = DAG.getNode(
10048 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
10049 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
10050 DAG.getConstant(0, DL, MVT::i64));
10051 TPIDR2.Uses++;
10052 } else if (RequiresSaveAllZA) {
10053 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10054 /*IsSave=*/false);
10055 }
10056
10057 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10058 RequiresSaveAllZA) {
10059 for (unsigned I = 0; I < InVals.size(); ++I) {
10060 // The smstart/smstop is chained as part of the call, but when the
10061 // resulting chain is discarded (which happens when the call is not part
10062 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10063 // smstart/smstop is chained to the result value. We can do that by doing
10064 // a vreg -> vreg copy.
10065 Register Reg = MF.getRegInfo().createVirtualRegister(
10066 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10067 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10068 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10069 InVals[I].getValueType());
10070 }
10071 }
10072
10073 if (CallConv == CallingConv::PreserveNone) {
10074 for (const ISD::OutputArg &O : Outs) {
10075 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10076 O.Flags.isSwiftAsync()) {
10077 MachineFunction &MF = DAG.getMachineFunction();
10078 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10079 MF.getFunction(),
10080 "Swift attributes can't be used with preserve_none",
10081 DL.getDebugLoc()));
10082 break;
10083 }
10084 }
10085 }
10086
10087 return Result;
10088}
10089
10090bool AArch64TargetLowering::CanLowerReturn(
10091 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10092 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10093 const Type *RetTy) const {
10094 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10095 SmallVector<CCValAssign, 16> RVLocs;
10096 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10097 return CCInfo.CheckReturn(Outs, RetCC);
10098}
10099
10100SDValue
10101AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10102 bool isVarArg,
10103 const SmallVectorImpl<ISD::OutputArg> &Outs,
10104 const SmallVectorImpl<SDValue> &OutVals,
10105 const SDLoc &DL, SelectionDAG &DAG) const {
10106 auto &MF = DAG.getMachineFunction();
10107 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10108
10109 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10110 SmallVector<CCValAssign, 16> RVLocs;
10111 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10112 CCInfo.AnalyzeReturn(Outs, RetCC);
10113
10114 // Copy the result values into the output registers.
10115 SDValue Glue;
10116 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10117 SmallSet<unsigned, 4> RegsUsed;
10118 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10119 ++i, ++realRVLocIdx) {
10120 CCValAssign &VA = RVLocs[i];
10121 assert(VA.isRegLoc() && "Can only return in registers!");
10122 SDValue Arg = OutVals[realRVLocIdx];
10123
10124 switch (VA.getLocInfo()) {
10125 default:
10126 llvm_unreachable("Unknown loc info!");
10127 case CCValAssign::Full:
10128 if (Outs[i].ArgVT == MVT::i1) {
10129 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10130 // value. This is strictly redundant on Darwin (which uses "zeroext
10131 // i1"), but will be optimised out before ISel.
10132 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10133 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10134 }
10135 break;
10136 case CCValAssign::BCvt:
10137 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10138 break;
10139 case CCValAssign::AExt:
10140 case CCValAssign::ZExt:
10141 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10142 break;
10143 case CCValAssign::AExtUpper:
10144 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10145 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10146 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10147 DAG.getConstant(32, DL, VA.getLocVT()));
10148 break;
10149 }
10150
10151 if (RegsUsed.count(VA.getLocReg())) {
10152 SDValue &Bits =
10153 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10154 return Elt.first == VA.getLocReg();
10155 })->second;
10156 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10157 } else {
10158 RetVals.emplace_back(VA.getLocReg(), Arg);
10159 RegsUsed.insert(VA.getLocReg());
10160 }
10161 }
10162
10163 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10164
10165 // Emit SMSTOP before returning from a locally streaming function
10166 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10167 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10168 if (FuncAttrs.hasStreamingCompatibleInterface())
10169 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10170 /*Glue*/ SDValue(),
10171 AArch64SME::IfCallerIsNonStreaming);
10172 else
10173 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10174 /*Glue*/ SDValue(), AArch64SME::Always);
10175 Glue = Chain.getValue(1);
10176 }
10177
10178 SmallVector<SDValue, 4> RetOps(1, Chain);
10179 for (auto &RetVal : RetVals) {
10180 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10181 isPassedInFPR(RetVal.second.getValueType()))
10182 RetVal.second =
10183 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10184 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10185 RetVal.second);
10186 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10187 Glue = Chain.getValue(1);
10188 RetOps.push_back(
10189 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10190 }
10191
10192 // Windows AArch64 ABIs require that for returning structs by value we copy
10193 // the sret argument into X0 for the return.
10194 // We saved the argument into a virtual register in the entry block,
10195 // so now we copy the value out and into X0.
10196 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10197 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10198 getPointerTy(MF.getDataLayout()));
10199
10200 unsigned RetValReg = AArch64::X0;
10201 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10202 RetValReg = AArch64::X8;
10203 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10204 Glue = Chain.getValue(1);
10205
10206 RetOps.push_back(
10207 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10208 }
10209
10210 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10211 if (I) {
10212 for (; *I; ++I) {
10213 if (AArch64::GPR64RegClass.contains(*I))
10214 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10215 else if (AArch64::FPR64RegClass.contains(*I))
10216 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10217 else
10218 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10219 }
10220 }
10221
10222 RetOps[0] = Chain; // Update chain.
10223
10224 // Add the glue if we have it.
10225 if (Glue.getNode())
10226 RetOps.push_back(Glue);
10227
10228 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10229 // ARM64EC entry thunks use a special return sequence: instead of a regular
10230 // "ret" instruction, they need to explicitly call the emulator.
10231 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10232 SDValue Arm64ECRetDest =
10233 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10234 Arm64ECRetDest =
10235 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10236 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10237 MachinePointerInfo());
10238 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10239 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10240 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10241 }
10242
10243 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10244}
10245
10246//===----------------------------------------------------------------------===//
10247// Other Lowering Code
10248//===----------------------------------------------------------------------===//
10249
10250SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10251 SelectionDAG &DAG,
10252 unsigned Flag) const {
10253 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10254 N->getOffset(), Flag);
10255}
10256
10257SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10258 SelectionDAG &DAG,
10259 unsigned Flag) const {
10260 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10261}
10262
10263SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10264 SelectionDAG &DAG,
10265 unsigned Flag) const {
10266 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10267 N->getOffset(), Flag);
10268}
10269
10270SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10271 SelectionDAG &DAG,
10272 unsigned Flag) const {
10273 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10274}
10275
10276SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10277 SelectionDAG &DAG,
10278 unsigned Flag) const {
10279 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10280}
10281
10282// (loadGOT sym)
10283template <class NodeTy>
10284SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10285 unsigned Flags) const {
10286 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10287 SDLoc DL(N);
10288 EVT Ty = getPointerTy(DAG.getDataLayout());
10289 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10290 // FIXME: Once remat is capable of dealing with instructions with register
10291 // operands, expand this into two nodes instead of using a wrapper node.
10292 if (DAG.getMachineFunction()
10293 .getInfo<AArch64FunctionInfo>()
10294 ->hasELFSignedGOT())
10295 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10296 0);
10297 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10298}
10299
10300// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10301template <class NodeTy>
10302SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10303 unsigned Flags) const {
10304 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10305 SDLoc DL(N);
10306 EVT Ty = getPointerTy(DAG.getDataLayout());
10307 const unsigned char MO_NC = AArch64II::MO_NC;
10308 return DAG.getNode(
10309 AArch64ISD::WrapperLarge, DL, Ty,
10310 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10311 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10312 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10313 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10314}
10315
10316// (addlow (adrp %hi(sym)) %lo(sym))
10317template <class NodeTy>
10318SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10319 unsigned Flags) const {
10320 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10321 SDLoc DL(N);
10322 EVT Ty = getPointerTy(DAG.getDataLayout());
10323 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10324 SDValue Lo = getTargetNode(N, Ty, DAG,
10325 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10326 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10327 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10328}
10329
10330// (adr sym)
10331template <class NodeTy>
10332SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10333 unsigned Flags) const {
10334 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10335 SDLoc DL(N);
10336 EVT Ty = getPointerTy(DAG.getDataLayout());
10337 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10338 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10339}
10340
10341SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10342 SelectionDAG &DAG) const {
10343 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10344 const GlobalValue *GV = GN->getGlobal();
10345 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10346
10347 if (OpFlags != AArch64II::MO_NO_FLAG)
10348 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
10349 "unexpected offset in global node");
10350
10351 // This also catches the large code model case for Darwin, and tiny code
10352 // model with got relocations.
10353 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10354 return getGOT(GN, DAG, OpFlags);
10355 }
10356 
10357 SDValue Result;
10358 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10359 !getTargetMachine().isPositionIndependent()) {
10360 Result = getAddrLarge(GN, DAG, OpFlags);
10361 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10362 Result = getAddrTiny(GN, DAG, OpFlags);
10363 } else {
10364 Result = getAddr(GN, DAG, OpFlags);
10365 }
10366 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10367 SDLoc DL(GN);
10368 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10369 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10370 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10371 return Result;
10372}
10373
10374/// Convert a TLS address reference into the correct sequence of loads
10375/// and calls to compute the variable's address (for Darwin, currently) and
10376/// return an SDValue containing the final node.
10377
10378/// Darwin only has one TLS scheme which must be capable of dealing with the
10379/// fully general situation, in the worst case. This means:
10380/// + "extern __thread" declaration.
10381/// + Defined in a possibly unknown dynamic library.
10382///
10383/// The general system is that each __thread variable has a [3 x i64] descriptor
10384/// which contains information used by the runtime to calculate the address. The
10385/// only part of this the compiler needs to know about is the first xword, which
10386/// contains a function pointer that must be called with the address of the
10387/// entire descriptor in "x0".
10388///
10389/// Since this descriptor may be in a different unit, in general even the
10390/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10391/// is:
10392/// adrp x0, _var@TLVPPAGE
10393/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10394/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10395/// ; the function pointer
10396/// blr x1 ; Uses descriptor address in x0
10397/// ; Address of _var is now in x0.
10398///
10399/// If the address of _var's descriptor *is* known to the linker, then it can
10400/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10401/// a slight efficiency gain.
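/// A minimal illustrative IR-level trigger for this lowering (names are
/// placeholders):
///   @var = thread_local global i32 0
///   define i32 @get() {
///     %p = call ptr @llvm.threadlocal.address.p0(ptr @var)
///     %v = load i32, ptr %p
///     ret i32 %v
///   }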
10402SDValue
10403AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10404 SelectionDAG &DAG) const {
10405 assert(Subtarget->isTargetDarwin() &&
10406 "This function expects a Darwin target");
10407
10408 SDLoc DL(Op);
10409 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10410 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10411 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10412
10413 SDValue TLVPAddr =
10414 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10415 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10416
10417 // The first entry in the descriptor is a function pointer that we must call
10418 // to obtain the address of the variable.
10419 SDValue Chain = DAG.getEntryNode();
10420 SDValue FuncTLVGet = DAG.getLoad(
10421 PtrMemVT, DL, Chain, DescAddr,
10422 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10423 Align(PtrMemVT.getSizeInBits() / 8),
10424 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10425 Chain = FuncTLVGet.getValue(1);
10426
10427 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10428 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10429
10430 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10431 MFI.setAdjustsStack(true);
10432
10433 // TLS calls preserve all registers except those that absolutely must be
10434 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10435 // silly).
10436 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10437 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10438 if (Subtarget->hasCustomCallingConv())
10439 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10440
10441 // Finally, we can make the call. This is just a degenerate version of a
10442 // normal AArch64 call node: x0 takes the address of the descriptor, and
10443 // returns the address of the variable in this thread.
10444 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10445
10446 unsigned Opcode = AArch64ISD::CALL;
10447 SmallVector<SDValue, 8> Ops;
10448 Ops.push_back(Chain);
10449 Ops.push_back(FuncTLVGet);
10450
10451 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10452 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10453 Opcode = AArch64ISD::AUTH_CALL;
10454 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10455 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10456 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10457 }
10458
10459 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10460 Ops.push_back(DAG.getRegisterMask(Mask));
10461 Ops.push_back(Chain.getValue(1));
10462 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10463 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10464}
10465
10466/// Convert a thread-local variable reference into a sequence of instructions to
10467/// compute the variable's address for the local exec TLS model of ELF targets.
10468/// The sequence depends on the maximum TLS area size.
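/// Illustrative driver usage that selects the 32-bit variant below, assuming
/// the usual clang spelling of the option (the flag handling itself lives in
/// the front end, not here):
///   clang --target=aarch64-linux-gnu -mtls-size=32 -c tls.c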
10469SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10470 SDValue ThreadBase,
10471 const SDLoc &DL,
10472 SelectionDAG &DAG) const {
10473 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10474 SDValue TPOff, Addr;
10475
10476 switch (DAG.getTarget().Options.TLSSize) {
10477 default:
10478 llvm_unreachable("Unexpected TLS size");
10479
10480 case 12: {
10481 // mrs x0, TPIDR_EL0
10482 // add x0, x0, :tprel_lo12:a
10483 SDValue Var = DAG.getTargetGlobalAddress(
10484 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10485 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10486 Var,
10487 DAG.getTargetConstant(0, DL, MVT::i32)),
10488 0);
10489 }
10490
10491 case 24: {
10492 // mrs x0, TPIDR_EL0
10493 // add x0, x0, :tprel_hi12:a
10494 // add x0, x0, :tprel_lo12_nc:a
10495 SDValue HiVar = DAG.getTargetGlobalAddress(
10496 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10497 SDValue LoVar = DAG.getTargetGlobalAddress(
10498 GV, DL, PtrVT, 0,
10499 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10500 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10501 HiVar,
10502 DAG.getTargetConstant(0, DL, MVT::i32)),
10503 0);
10504 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10505 LoVar,
10506 DAG.getTargetConstant(0, DL, MVT::i32)),
10507 0);
10508 }
10509
10510 case 32: {
10511 // mrs x1, TPIDR_EL0
10512 // movz x0, #:tprel_g1:a
10513 // movk x0, #:tprel_g0_nc:a
10514 // add x0, x1, x0
10515 SDValue HiVar = DAG.getTargetGlobalAddress(
10516 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10517 SDValue LoVar = DAG.getTargetGlobalAddress(
10518 GV, DL, PtrVT, 0,
10519 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10520 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10521 DAG.getTargetConstant(16, DL, MVT::i32)),
10522 0);
10523 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10524 DAG.getTargetConstant(0, DL, MVT::i32)),
10525 0);
10526 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10527 }
10528
10529 case 48: {
10530 // mrs x1, TPIDR_EL0
10531 // movz x0, #:tprel_g2:a
10532 // movk x0, #:tprel_g1_nc:a
10533 // movk x0, #:tprel_g0_nc:a
10534 // add x0, x1, x0
10535 SDValue HiVar = DAG.getTargetGlobalAddress(
10536 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10537 SDValue MiVar = DAG.getTargetGlobalAddress(
10538 GV, DL, PtrVT, 0,
10539 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10540 SDValue LoVar = DAG.getTargetGlobalAddress(
10541 GV, DL, PtrVT, 0,
10542 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10543 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10544 DAG.getTargetConstant(32, DL, MVT::i32)),
10545 0);
10546 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10547 DAG.getTargetConstant(16, DL, MVT::i32)),
10548 0);
10549 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10550 DAG.getTargetConstant(0, DL, MVT::i32)),
10551 0);
10552 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10553 }
10554 }
10555}
10556
10557/// When accessing thread-local variables under either the general-dynamic or
10558/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10559/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10560/// is a function pointer to carry out the resolution.
10561///
10562/// The sequence is:
10563/// adrp x0, :tlsdesc:var
10564/// ldr x1, [x0, #:tlsdesc_lo12:var]
10565/// add x0, x0, #:tlsdesc_lo12:var
10566/// .tlsdesccall var
10567/// blr x1
10568/// (TPIDR_EL0 offset now in x0)
10569///
10570/// The above sequence must be produced unscheduled, to enable the linker to
10571/// optimize/relax this sequence.
10572/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10573/// above sequence, and expanded really late in the compilation flow, to ensure
10574/// the sequence is produced as per above.
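/// As an illustrative sketch (the exact rewrites are the linker's business),
/// a linker that proves the variable is local-exec may relax the four
/// instructions above in place to something like:
///   movz x0, #:tprel_g1:var
///   movk x0, #:tprel_g0_nc:var
///   nop
///   nop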
10575SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10576 const SDLoc &DL,
10577 SelectionDAG &DAG) const {
10578 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10579
10580 SDValue Chain = DAG.getEntryNode();
10581 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10582
10583 unsigned Opcode =
10584 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10585 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10586 : AArch64ISD::TLSDESC_CALLSEQ;
10587 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10588 SDValue Glue = Chain.getValue(1);
10589
10590 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10591}
10592
10593SDValue
10594AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10595 SelectionDAG &DAG) const {
10596 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10597
10598 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10599 AArch64FunctionInfo *MFI =
10600 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10601 
10602 TLSModel::Model Model = MFI->hasELFSignedGOT()
10603 ? TLSModel::GeneralDynamic
10604 : getTargetMachine().getTLSModel(GA->getGlobal());
10605 
10606 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10607 if (Model == TLSModel::LocalDynamic)
10608 Model = TLSModel::GeneralDynamic;
10609 }
10610 
10611 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10612 Model != TLSModel::LocalExec)
10613 report_fatal_error("ELF TLS only supported in small memory model or "
10614 "in local exec TLS model");
10615 // Different choices can be made for the maximum size of the TLS area for a
10616 // module. For the small address model, the default TLS size is 16MiB and the
10617 // maximum TLS size is 4GiB.
10618 // FIXME: add tiny and large code model support for TLS access models other
10619 // than local exec. We currently generate the same code as small for tiny,
10620 // which may be larger than needed.
10621
10622 SDValue TPOff;
10623 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10624 SDLoc DL(Op);
10625 const GlobalValue *GV = GA->getGlobal();
10626
10627 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10628
10629 if (Model == TLSModel::LocalExec) {
10630 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10631 } else if (Model == TLSModel::InitialExec) {
10632 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10633 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10634 } else if (Model == TLSModel::LocalDynamic) {
10635 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10636 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10637 // the beginning of the module's TLS region, followed by a DTPREL offset
10638 // calculation.
10639
10640 // These accesses will need deduplicating if there's more than one.
10641 MFI->incNumLocalDynamicTLSAccesses();
10642 
10643 // The call needs a relocation too for linker relaxation. It doesn't make
10644 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10645 // the address.
10646 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10647 AArch64II::MO_TLS);
10648 
10649 // Now we can calculate the offset from TPIDR_EL0 to this module's
10650 // thread-local area.
10651 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10652
10653 // Now use :dtprel_whatever: operations to calculate this variable's offset
10654 // in its thread-storage area.
10655 SDValue HiVar = DAG.getTargetGlobalAddress(
10656 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10657 SDValue LoVar = DAG.getTargetGlobalAddress(
10658 GV, DL, MVT::i64, 0,
10659 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10660 
10661 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10662 DAG.getTargetConstant(0, DL, MVT::i32)),
10663 0);
10664 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10665 DAG.getTargetConstant(0, DL, MVT::i32)),
10666 0);
10667 } else if (Model == TLSModel::GeneralDynamic) {
10668 // The call needs a relocation too for linker relaxation. It doesn't make
10669 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10670 // the address.
10671 SDValue SymAddr =
10672 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10673
10674 // Finally we can make a call to calculate the offset from tpidr_el0.
10675 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10676 } else
10677 llvm_unreachable("Unsupported ELF TLS access model");
10678
10679 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10680}
10681
10682SDValue
10683AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10684 SelectionDAG &DAG) const {
10685 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10686
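// A minimal sketch of the address computation built below, written as
// pseudo-C (TEB, _tls_index and the section-relative offset are the usual
// Windows TLS pieces, not names defined in this function):
//   void **TLSArray = *(void ***)((char *)TEB + 0x58);
//   char *TLSBase = (char *)TLSArray[_tls_index];
//   void *Addr = TLSBase + SectionRelativeOffsetOf(var);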
10687 SDValue Chain = DAG.getEntryNode();
10688 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10689 SDLoc DL(Op);
10690
10691 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10692
10693 // Load the ThreadLocalStoragePointer from the TEB
10694 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10695 SDValue TLSArray =
10696 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10697 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10698 Chain = TLSArray.getValue(1);
10699
10700 // Load the TLS index from the C runtime;
10701 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10702 // This also does the same as LOADgot, but using a generic i32 load,
10703 // while LOADgot only loads i64.
10704 SDValue TLSIndexHi =
10705 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10706 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10707 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10708 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10709 SDValue TLSIndex =
10710 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10711 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10712 Chain = TLSIndex.getValue(1);
10713
10714 // The pointer to the thread's TLS data area is stored in the TLS array at
10715 // index _tls_index, i.e. at byte offset _tls_index * 8 from TLSArray.
10716 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10717 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10718 DAG.getConstant(3, DL, PtrVT));
10719 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10720 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10721 MachinePointerInfo());
10722 Chain = TLS.getValue(1);
10723
10724 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10725 const GlobalValue *GV = GA->getGlobal();
10726 SDValue TGAHi = DAG.getTargetGlobalAddress(
10727 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10728 SDValue TGALo = DAG.getTargetGlobalAddress(
10729 GV, DL, PtrVT, 0,
10730 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10731 
10732 // Add the offset from the start of the .tls section (section base).
10733 SDValue Addr =
10734 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10735 DAG.getTargetConstant(0, DL, MVT::i32)),
10736 0);
10737 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10738 return Addr;
10739}
10740
10741SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10742 SelectionDAG &DAG) const {
10743 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10744 if (DAG.getTarget().useEmulatedTLS())
10745 return LowerToTLSEmulatedModel(GA, DAG);
10746
10747 if (Subtarget->isTargetDarwin())
10748 return LowerDarwinGlobalTLSAddress(Op, DAG);
10749 if (Subtarget->isTargetELF())
10750 return LowerELFGlobalTLSAddress(Op, DAG);
10751 if (Subtarget->isTargetWindows())
10752 return LowerWindowsGlobalTLSAddress(Op, DAG);
10753
10754 llvm_unreachable("Unexpected platform trying to use TLS");
10755}
10756
10757//===----------------------------------------------------------------------===//
10758// PtrAuthGlobalAddress lowering
10759//
10760// We have 3 lowering alternatives to choose from:
10761// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10762// If the GV doesn't need a GOT load (i.e., is locally defined)
10763// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10764//
10765// - LOADgotPAC: similar to LOADgot, with added PAC.
10766// If the GV needs a GOT load, materialize the pointer using the usual
10767// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10768// section is assumed to be read-only (for example, via relro mechanism). See
10769// LowerMOVaddrPAC.
10770//
10771// - LOADauthptrstatic: similar to LOADgot, but use a
10772// special stub slot instead of a GOT slot.
10773// Load a signed pointer for symbol 'sym' from a stub slot named
10774// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10775// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10776// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10777//
10778// All 3 are pseudos that are expand late to longer sequences: this lets us
10779// provide integrity guarantees on the to-be-signed intermediate values.
10780//
10781// LOADauthptrstatic is undesirable because it requires a large section filled
10782// with often similarly-signed pointers, making it a good harvesting target.
10783// Thus, it's only used for ptrauth references to extern_weak to avoid null
10784// checks.
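// A minimal illustrative IR example of a signed global reference that reaches
// this lowering (the symbol name, key 0 (IA) and the 16-bit constant
// discriminator are placeholders):
//   @g = external global i32
//   @signed_ref = constant ptr ptrauth (ptr @g, i32 0, i64 1234)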
10785
10786 static SDValue LowerPtrAuthGlobalAddressStatically(
10787 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10788 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10789 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10790 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10791
10792 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10793 // offset alone as a pointer if the symbol wasn't available, which would
10794 // probably break null checks in users. Ptrauth complicates things further:
10795 // error out.
10796 if (TGN->getOffset() != 0)
10797 report_fatal_error(
10798 "unsupported non-zero offset in weak ptrauth global reference");
10799
10800 if (!isNullConstant(AddrDiscriminator))
10801 report_fatal_error("unsupported weak addr-div ptrauth global");
10802
10803 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10804 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10805 {TGA, Key, Discriminator}),
10806 0);
10807}
10808
10809SDValue
10810AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10811 SelectionDAG &DAG) const {
10812 SDValue Ptr = Op.getOperand(0);
10813 uint64_t KeyC = Op.getConstantOperandVal(1);
10814 SDValue AddrDiscriminator = Op.getOperand(2);
10815 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10816 EVT VT = Op.getValueType();
10817 SDLoc DL(Op);
10818
10819 if (KeyC > AArch64PACKey::LAST)
10820 report_fatal_error("key in ptrauth global out of range [0, " +
10821 Twine((int)AArch64PACKey::LAST) + "]");
10822
10823 // Blend only works if the integer discriminator is 16-bit wide.
10824 if (!isUInt<16>(DiscriminatorC))
10825 report_fatal_error(
10826 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10827
10828 // Choosing between 3 lowering alternatives is target-specific.
10829 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10830 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10831
10832 int64_t PtrOffsetC = 0;
10833 if (Ptr.getOpcode() == ISD::ADD) {
10834 PtrOffsetC = Ptr.getConstantOperandVal(1);
10835 Ptr = Ptr.getOperand(0);
10836 }
10837 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10838 const GlobalValue *PtrGV = PtrN->getGlobal();
10839
10840 // Classify the reference to determine whether it needs a GOT load.
10841 const unsigned OpFlags =
10842 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10843 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10844 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10845 "unsupported non-GOT op flags on ptrauth global reference");
10846
10847 // Fold any offset into the GV; our pseudos expect it there.
10848 PtrOffsetC += PtrN->getOffset();
10849 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10850 /*TargetFlags=*/0);
10851 assert(PtrN->getTargetFlags() == 0 &&
10852 "unsupported target flags on ptrauth global");
10853
10854 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10855 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10856 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10857 ? AddrDiscriminator
10858 : DAG.getRegister(AArch64::XZR, MVT::i64);
10859
10860 // No GOT load needed -> MOVaddrPAC
10861 if (!NeedsGOTLoad) {
10862 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10863 return SDValue(
10864 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10865 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10866 0);
10867 }
10868
10869 // GOT load -> LOADgotPAC
10870 // Note that we disallow extern_weak refs to avoid null checks later.
10871 if (!PtrGV->hasExternalWeakLinkage())
10872 return SDValue(
10873 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10874 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10875 0);
10876
10877 // extern_weak ref -> LOADauthptrstatic
10878 return LowerPtrAuthGlobalAddressStatically(
10879 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10880 DAG);
10881}
10882
10883// Looks through \param Val to determine the bit that can be used to
10884// check the sign of the value. It returns the unextended value and
10885// the sign bit position.
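// For example (illustrative): (sign_extend_inreg i32 %v, i8) yields {%v, 7},
// sign_extend of an i32 value to i64 yields {value, 31}, and a plain i32
// value yields {value, 31}.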
10886std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10887 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10888 return {Val.getOperand(0),
10889 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10890 1};
10891
10892 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10893 return {Val.getOperand(0),
10894 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10895
10896 return {Val, Val.getValueSizeInBits() - 1};
10897}
10898
10899SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10900 SDValue Chain = Op.getOperand(0);
10901 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10902 SDValue LHS = Op.getOperand(2);
10903 SDValue RHS = Op.getOperand(3);
10904 SDValue Dest = Op.getOperand(4);
10905 SDLoc DL(Op);
10906
10907 MachineFunction &MF = DAG.getMachineFunction();
10908 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10909 // will not be produced, as they are conditional branch instructions that do
10910 // not set flags.
10911 bool ProduceNonFlagSettingCondBr =
10912 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10913
10914 // Handle f128 first, since lowering it will result in comparing the return
10915 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10916 // is expecting to deal with.
10917 if (LHS.getValueType() == MVT::f128) {
10918 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10919
10920 // If softenSetCCOperands returned a scalar, we need to compare the result
10921 // against zero to select between true and false values.
10922 if (!RHS.getNode()) {
10923 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10924 CC = ISD::SETNE;
10925 }
10926 }
10927
10928 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10929 // instruction.
10930 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10931 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10932 // Only lower legal XALUO ops.
10933 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10934 return SDValue();
10935
10936 // The actual operation with overflow check.
10937 AArch64CC::CondCode OFCC;
10938 SDValue Value, Overflow;
10939 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10940
10941 if (CC == ISD::SETNE)
10942 OFCC = getInvertedCondCode(OFCC);
10943 SDValue CCVal = getCondCode(DAG, OFCC);
10944
10945 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10946 Overflow);
10947 }
10948
10949 if (LHS.getValueType().isInteger()) {
10950 assert((LHS.getValueType() == RHS.getValueType()) &&
10951 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10952
10953 // If the RHS of the comparison is zero, we can potentially fold this
10954 // to a specialized branch.
10955 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10956 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10957 if (CC == ISD::SETEQ) {
10958 // See if we can use a TBZ to fold in an AND as well.
10959 // TBZ has a smaller branch displacement than CBZ. If the offset is
10960 // out of bounds, a late MI-layer pass rewrites branches.
10961 // 403.gcc is an example that hits this case.
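// For example (illustrative): a branch on (seteq (and x, 4), 0) to 'dest'
// becomes "tbz x, #2, dest", since Log2_64(4) == 2 and TBZ branches when
// that bit is clear.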
10962 if (LHS.getOpcode() == ISD::AND &&
10963 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10964 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10965 SDValue Test = LHS.getOperand(0);
10966 uint64_t Mask = LHS.getConstantOperandVal(1);
10967 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
10968 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10969 Dest);
10970 }
10971
10972 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
10973 } else if (CC == ISD::SETNE) {
10974 // See if we can use a TBZ to fold in an AND as well.
10975 // TBZ has a smaller branch displacement than CBZ. If the offset is
10976 // out of bounds, a late MI-layer pass rewrites branches.
10977 // 403.gcc is an example that hits this case.
10978 if (LHS.getOpcode() == ISD::AND &&
10979 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10980 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10981 SDValue Test = LHS.getOperand(0);
10982 uint64_t Mask = LHS.getConstantOperandVal(1);
10983 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
10984 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10985 Dest);
10986 }
10987
10988 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
10989 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10990 // Don't combine AND since emitComparison converts the AND to an ANDS
10991 // (a.k.a. TST) and the test in the test bit and branch instruction
10992 // becomes redundant. This would also increase register pressure.
10993 uint64_t SignBitPos;
10994 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10995 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
10996 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10997 }
10998 }
10999 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11000 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11001 // Don't combine AND since emitComparison converts the AND to an ANDS
11002 // (a.k.a. TST) and the test in the test bit and branch instruction
11003 // becomes redundant. This would also increase register pressure.
11004 uint64_t SignBitPos;
11005 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11006 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11007 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11008 }
11009
11010 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11011 // larger branch displacement but do prefer CB over cmp + br.
11012 if (Subtarget->hasCMPBR() &&
11014 ProduceNonFlagSettingCondBr) {
11015 SDValue Cond =
11017 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11018 Dest);
11019 }
11020
11021 SDValue CCVal;
11022 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11023 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11024 Cmp);
11025 }
11026
11027 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11028 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11029
11030 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11031 // clean. Some of them require two branches to implement.
11032 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11033 AArch64CC::CondCode CC1, CC2;
11034 changeFPCCToAArch64CC(CC, CC1, CC2);
11035 SDValue CC1Val = getCondCode(DAG, CC1);
11036 SDValue BR1 =
11037 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11038 if (CC2 != AArch64CC::AL) {
11039 SDValue CC2Val = getCondCode(DAG, CC2);
11040 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11041 Cmp);
11042 }
11043
11044 return BR1;
11045}
11046
11047SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11048 SelectionDAG &DAG) const {
11049 if (!Subtarget->isNeonAvailable() &&
11050 !Subtarget->useSVEForFixedLengthVectors())
11051 return SDValue();
11052
11053 EVT VT = Op.getValueType();
11054 EVT IntVT = VT.changeTypeToInteger();
11055 SDLoc DL(Op);
11056
11057 SDValue In1 = Op.getOperand(0);
11058 SDValue In2 = Op.getOperand(1);
11059 EVT SrcVT = In2.getValueType();
11060
11061 if (!SrcVT.bitsEq(VT))
11062 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11063
11064 if (VT.isScalableVector())
11065 IntVT =
11066 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11067 
11068 if (VT.isFixedLengthVector() &&
11069 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11070 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11071
11072 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11073 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11074
11075 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11076 return convertFromScalableVector(DAG, VT, Res);
11077 }
11078
11079 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11080 // a SVE FCOPYSIGN.
11081 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11082 Subtarget->isSVEorStreamingSVEAvailable()) {
11083 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11084 return SDValue();
11085 EVT SVT = getPackedSVEVectorVT(VT);
11086
11087 SDValue Ins1 =
11088 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11089 DAG.getConstant(0, DL, MVT::i64));
11090 SDValue Ins2 =
11091 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11092 DAG.getConstant(0, DL, MVT::i64));
11093 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11095 DAG.getConstant(0, DL, MVT::i64));
11096 }
11097
11098 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11099 if (VT.isScalableVector())
11100 return getSVESafeBitCast(VT, Op, DAG);
11101
11102 return DAG.getBitcast(VT, Op);
11103 };
11104
11105 SDValue VecVal1, VecVal2;
11106 EVT VecVT;
11107 auto SetVecVal = [&](int Idx = -1) {
11108 if (!VT.isVector()) {
11109 VecVal1 =
11110 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11111 VecVal2 =
11112 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11113 } else {
11114 VecVal1 = BitCast(VecVT, In1, DAG);
11115 VecVal2 = BitCast(VecVT, In2, DAG);
11116 }
11117 };
11118 if (VT.isVector()) {
11119 VecVT = IntVT;
11120 SetVecVal();
11121 } else if (VT == MVT::f64) {
11122 VecVT = MVT::v2i64;
11123 SetVecVal(AArch64::dsub);
11124 } else if (VT == MVT::f32) {
11125 VecVT = MVT::v4i32;
11126 SetVecVal(AArch64::ssub);
11127 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11128 VecVT = MVT::v8i16;
11129 SetVecVal(AArch64::hsub);
11130 } else {
11131 llvm_unreachable("Invalid type for copysign!");
11132 }
11133
11134 unsigned BitWidth = In1.getScalarValueSizeInBits();
11135 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11136
11137 // We want to materialize a mask with every bit but the high bit set, but the
11138 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11139 // 64-bit elements. Instead, materialize all bits set and then negate that.
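// Sketch of the trick used below: MOVI gives all-ones in each 64-bit lane,
// and a vector FNEG then flips only the sign bit of each lane, leaving
// 0x7fffffffffffffff, i.e. the desired ~SignMask value.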
11140 if (VT == MVT::f64 || VT == MVT::v2f64) {
11141 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11142 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11143 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11144 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11145 }
11146
11147 SDValue BSP =
11148 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11149 if (VT == MVT::f16 || VT == MVT::bf16)
11150 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11151 if (VT == MVT::f32)
11152 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11153 if (VT == MVT::f64)
11154 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11155
11156 return BitCast(VT, BSP, DAG);
11157}
11158
11159SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11160 SelectionDAG &DAG) const {
11161   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11162                                     Attribute::NoImplicitFloat))
11163 return SDValue();
11164
11165 EVT VT = Op.getValueType();
11166 if (VT.isScalableVector() ||
11167 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11168 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11169
11170 bool IsParity = Op.getOpcode() == ISD::PARITY;
11171 SDValue Val = Op.getOperand(0);
11172 SDLoc DL(Op);
11173
11174   // For i32, the general parity expansion using EORs is more efficient than
11175   // going through the floating-point/SIMD registers.
11176 if (VT == MVT::i32 && IsParity)
11177 return SDValue();
11178
11179 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11180 if (VT == MVT::i32 || VT == MVT::i64) {
11181 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11182 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11183 DAG.getUNDEF(ContainerVT), Val,
11184 DAG.getVectorIdxConstant(0, DL));
11185 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11186 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11187 DAG.getVectorIdxConstant(0, DL));
11188 if (IsParity)
11189 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11190 return Val;
11191 }
11192
11193 if (VT == MVT::i128) {
11194 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11195 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11196 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11197 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11198 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11199 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11200 if (IsParity)
11201 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11202 return Val;
11203 }
11204 }
11205
11206 if (!Subtarget->isNeonAvailable())
11207 return SDValue();
11208
11209   // There is no CNT instruction for the GPRs, so a GPR popcount can
11210   // be more efficiently lowered to the following sequence that uses
11211   // AdvSIMD registers/instructions, as long as the copies to/from
11212   // the AdvSIMD registers are cheap.
11213 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11214 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11215 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11216 // FMOV X0, D0 // copy result back to integer reg
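  // Worked example (illustrative, not part of the original source): for
  // X0 = 0x00FF00FF00FF00FF the per-byte counts produced by CNT are
  // {8,0,8,0,8,0,8,0} and ADDV sums them to 32, the expected popcount. For an
  // i32 input the value is zero-extended to i64 first, so the upper byte
  // lanes contribute zero to the sum.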
11217 if (VT == MVT::i32 || VT == MVT::i64) {
11218 if (VT == MVT::i32)
11219 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11220 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11221
11222 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11223 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11224 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11225 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11226 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11227 DAG.getConstant(0, DL, MVT::i64));
11228 if (IsParity)
11229 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11230 return AddV;
11231 } else if (VT == MVT::i128) {
11232 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11233
11234 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11235 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11236 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11237 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11238 DAG.getConstant(0, DL, MVT::i64));
11239 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11240 if (IsParity)
11241 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11242 return AddV;
11243 }
11244
11245 assert(!IsParity && "ISD::PARITY of vector types not supported");
11246
11247 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11248 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11249 "Unexpected type for custom ctpop lowering");
11250
11251 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11252 Val = DAG.getBitcast(VT8Bit, Val);
11253 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11254
11255 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11256 VT.getVectorNumElements() >= 2) {
11257 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11258 SDValue Zeros = DAG.getConstant(0, DL, DT);
11259 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11260
11261 if (VT == MVT::v2i64) {
11262 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11263 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11264 } else if (VT == MVT::v2i32) {
11265 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11266 } else if (VT == MVT::v4i32) {
11267 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11268 } else {
11269 llvm_unreachable("Unexpected type for custom ctpop lowering");
11270 }
11271
11272 return Val;
11273 }
11274
11275 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11276 unsigned EltSize = 8;
11277 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11278 while (EltSize != VT.getScalarSizeInBits()) {
11279 EltSize *= 2;
11280 NumElts /= 2;
11281 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11282 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11283 }
11284
11285 return Val;
11286}
11287
11288SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11289 EVT VT = Op.getValueType();
11290   assert(VT.isScalableVector() ||
11291          useSVEForFixedLengthVectorVT(
11292          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11293
11294 SDLoc DL(Op);
11295 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11296 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11297}
11298
11299SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11300 SelectionDAG &DAG) const {
11301
11302 EVT VT = Op.getValueType();
11303 SDLoc DL(Op);
11304 unsigned Opcode = Op.getOpcode();
11305 ISD::CondCode CC;
11306 switch (Opcode) {
11307 default:
11308 llvm_unreachable("Wrong instruction");
11309 case ISD::SMAX:
11310 CC = ISD::SETGT;
11311 break;
11312 case ISD::SMIN:
11313 CC = ISD::SETLT;
11314 break;
11315 case ISD::UMAX:
11316 CC = ISD::SETUGT;
11317 break;
11318 case ISD::UMIN:
11319 CC = ISD::SETULT;
11320 break;
11321 }
11322
11323   if (VT.isScalableVector() ||
11324       useSVEForFixedLengthVectorVT(
11325       VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11326 switch (Opcode) {
11327 default:
11328 llvm_unreachable("Wrong instruction");
11329 case ISD::SMAX:
11330 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11331 case ISD::SMIN:
11332 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11333 case ISD::UMAX:
11334 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11335 case ISD::UMIN:
11336 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11337 }
11338 }
11339
11340 SDValue Op0 = Op.getOperand(0);
11341 SDValue Op1 = Op.getOperand(1);
11342 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11343 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11344}
11345
11346SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11347 SelectionDAG &DAG) const {
11348 EVT VT = Op.getValueType();
11349
11350   if (VT.isScalableVector() ||
11351       useSVEForFixedLengthVectorVT(
11352       VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11353 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11354
11355 SDLoc DL(Op);
11356 SDValue REVB;
11357 MVT VST;
11358
11359 switch (VT.getSimpleVT().SimpleTy) {
11360 default:
11361 llvm_unreachable("Invalid type for bitreverse!");
11362
11363 case MVT::v2i32: {
11364 VST = MVT::v8i8;
11365 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11366
11367 break;
11368 }
11369
11370 case MVT::v4i32: {
11371 VST = MVT::v16i8;
11372 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11373
11374 break;
11375 }
11376
11377 case MVT::v1i64: {
11378 VST = MVT::v8i8;
11379 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11380
11381 break;
11382 }
11383
11384 case MVT::v2i64: {
11385 VST = MVT::v16i8;
11386 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11387
11388 break;
11389 }
11390 }
11391
11392 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11393 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11394}
11395
11396 // Check whether N forms a continuous comparison sequence (a chain of ORs over
11396 // XOR leaves).
11397static bool
11398isOrXorChain(SDValue N, unsigned &Num,
11399 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11400 if (Num == MaxXors)
11401 return false;
11402
11403 // Skip the one-use zext
11404 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11405 N = N->getOperand(0);
11406
11407 // The leaf node must be XOR
11408 if (N->getOpcode() == ISD::XOR) {
11409 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11410 Num++;
11411 return true;
11412 }
11413
11414 // All the non-leaf nodes must be OR.
11415 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11416 return false;
11417
11418 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11419 isOrXorChain(N->getOperand(1), Num, WorkList))
11420 return true;
11421 return false;
11422}
11423
11424 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
11425 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11426   SDValue LHS = N->getOperand(0);
11427 SDValue RHS = N->getOperand(1);
11428 SDLoc DL(N);
11429   EVT VT = N->getValueType(0);
11430   SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11431
11432 // Only handle integer compares.
11433 if (N->getOpcode() != ISD::SETCC)
11434 return SDValue();
11435
11436 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11437 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11438 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
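  // Illustrative sketch (assumed operands/registers, not from the original
  // source): an equality memcmp of two 16-byte blocks,
  //   icmp eq ((A0 ^ A1) | (B0 ^ B1)), 0
  // becomes two setccs joined by an AND below, which later folds into
  //   cmp  x0, x2
  //   ccmp x1, x3, #0, eq
  //   cset w0, eq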
11439 unsigned NumXors = 0;
11440 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11441 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11442 isOrXorChain(LHS, NumXors, WorkList)) {
11443 SDValue XOR0, XOR1;
11444 std::tie(XOR0, XOR1) = WorkList[0];
11445 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11446 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11447 for (unsigned I = 1; I < WorkList.size(); I++) {
11448 std::tie(XOR0, XOR1) = WorkList[I];
11449 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11450 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11451 }
11452
11453     // Exit early by inverting the condition, which helps reduce indentation.
11454 return Cmp;
11455 }
11456
11457 return SDValue();
11458}
11459
11460SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11461
11462 if (Op.getValueType().isVector())
11463 return LowerVSETCC(Op, DAG);
11464
11465 bool IsStrict = Op->isStrictFPOpcode();
11466 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11467 unsigned OpNo = IsStrict ? 1 : 0;
11468 SDValue Chain;
11469 if (IsStrict)
11470 Chain = Op.getOperand(0);
11471 SDValue LHS = Op.getOperand(OpNo + 0);
11472 SDValue RHS = Op.getOperand(OpNo + 1);
11473 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11474 SDLoc DL(Op);
11475
11476 // We chose ZeroOrOneBooleanContents, so use zero and one.
11477 EVT VT = Op.getValueType();
11478 SDValue TVal = DAG.getConstant(1, DL, VT);
11479 SDValue FVal = DAG.getConstant(0, DL, VT);
11480
11481 // Handle f128 first, since one possible outcome is a normal integer
11482 // comparison which gets picked up by the next if statement.
11483 if (LHS.getValueType() == MVT::f128) {
11484 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11485 IsSignaling);
11486
11487 // If softenSetCCOperands returned a scalar, use it.
11488 if (!RHS.getNode()) {
11489 assert(LHS.getValueType() == Op.getValueType() &&
11490 "Unexpected setcc expansion!");
11491 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11492 }
11493 }
11494
11495 if (LHS.getValueType().isInteger()) {
11496
11497 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11498
11499     SDValue CCVal;
11500     SDValue Cmp = getAArch64Cmp(
11501         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11502
11503 // Note that we inverted the condition above, so we reverse the order of
11504 // the true and false operands here. This will allow the setcc to be
11505 // matched to a single CSINC instruction.
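    // For example (illustrative, not from the original source), "setcc eq a, b"
    // becomes CSEL(0, 1, ne, cmp a, b), i.e. "cmp a, b; cset w0, eq"; CSET is
    // an alias of CSINC with both sources tied to WZR and the condition
    // inverted.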
11506 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11507 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11508 }
11509
11510 // Now we know we're dealing with FP values.
11511 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11512 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11513
11514 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11515 // and do the comparison.
11516 SDValue Cmp;
11517 if (IsStrict)
11518 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11519 else
11520 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11521
11522 AArch64CC::CondCode CC1, CC2;
11523 changeFPCCToAArch64CC(CC, CC1, CC2);
11524 SDValue Res;
11525 if (CC2 == AArch64CC::AL) {
11526 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11527 CC2);
11528 SDValue CC1Val = getCondCode(DAG, CC1);
11529
11530 // Note that we inverted the condition above, so we reverse the order of
11531 // the true and false operands here. This will allow the setcc to be
11532 // matched to a single CSINC instruction.
11533 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11534 } else {
11535 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11536 // totally clean. Some of them require two CSELs to implement. As is in
11537 // this case, we emit the first CSEL and then emit a second using the output
11538 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11539
11540 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11541 SDValue CC1Val = getCondCode(DAG, CC1);
11542 SDValue CS1 =
11543 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11544
11545 SDValue CC2Val = getCondCode(DAG, CC2);
11546 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11547 }
11548 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11549}
11550
11551SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11552 SelectionDAG &DAG) const {
11553
11554 SDValue LHS = Op.getOperand(0);
11555 SDValue RHS = Op.getOperand(1);
11556 EVT VT = LHS.getValueType();
11557 if (VT != MVT::i32 && VT != MVT::i64)
11558 return SDValue();
11559
11560 SDLoc DL(Op);
11561 SDValue Carry = Op.getOperand(2);
11562 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11563 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11564 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11565 LHS, RHS, InvCarry);
11566
11567 EVT OpVT = Op.getValueType();
11568 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11569 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11570
11571   ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11572   ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11573   SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11574 // Inputs are swapped because the condition is inverted. This will allow
11575 // matching with a single CSINC instruction.
11576 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11577 Cmp.getValue(1));
11578}
11579
11580 /// Emit vector comparison for floating-point values, producing a mask.
11581 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11582                                   AArch64CC::CondCode CC, bool NoNans, EVT VT,
11583 const SDLoc &DL, SelectionDAG &DAG) {
11584 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11585 "function only supposed to emit natural comparisons");
11586
11587 switch (CC) {
11588 default:
11589 return SDValue();
11590 case AArch64CC::NE: {
11591 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11592 // Use vector semantics for the inversion to potentially save a copy between
11593 // SIMD and regular registers.
11594 if (!LHS.getValueType().isVector()) {
11595 EVT VecVT =
11596 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11597 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11598 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11599 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11600 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11601 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11602 }
11603 return DAG.getNOT(DL, Fcmeq, VT);
11604 }
11605 case AArch64CC::EQ:
11606 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11607 case AArch64CC::GE:
11608 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11609 case AArch64CC::GT:
11610 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11611 case AArch64CC::LE:
11612 if (!NoNans)
11613 return SDValue();
11614     // If we ignore NaNs then we can use the LS implementation.
11615 [[fallthrough]];
11616 case AArch64CC::LS:
11617 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11618 case AArch64CC::LT:
11619 if (!NoNans)
11620 return SDValue();
11621     // If we ignore NaNs then we can use the MI implementation.
11622 [[fallthrough]];
11623 case AArch64CC::MI:
11624 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11625 }
11626}
11627
11628/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11629 /// values are scalars, try to emit a mask generating vector instruction.
11630 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11631                                      SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11632 const SDLoc &DL, SelectionDAG &DAG) {
11633 assert(!LHS.getValueType().isVector());
11634 assert(!RHS.getValueType().isVector());
11635
11636 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11637 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11638 if (!CTVal || !CFVal)
11639 return {};
11640 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11641 !(CTVal->isZero() && CFVal->isAllOnes()))
11642 return {};
11643
11644 if (CTVal->isZero())
11645 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11646
11647 EVT VT = TVal.getValueType();
11648 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11649 return {};
11650
11651 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11652 bool OneNaN = false;
11653 if (LHS == RHS) {
11654 OneNaN = true;
11655 } else if (DAG.isKnownNeverNaN(RHS)) {
11656 OneNaN = true;
11657 RHS = LHS;
11658 } else if (DAG.isKnownNeverNaN(LHS)) {
11659 OneNaN = true;
11660 LHS = RHS;
11661 }
11662 if (OneNaN)
11663 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11664 }
11665
11666   AArch64CC::CondCode CC1;
11667   AArch64CC::CondCode CC2;
11668   bool ShouldInvert = false;
11669 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11670 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11671 SDValue Cmp2;
11672 if (CC2 != AArch64CC::AL) {
11673 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11674 if (!Cmp2)
11675 return {};
11676 }
11677 if (!Cmp2 && !ShouldInvert)
11678 return Cmp;
11679
11680 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11681 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11682 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11683 Zero);
11684 if (Cmp2) {
11685 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11686 Cmp2, Zero);
11687 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11688 }
11689 if (ShouldInvert)
11690 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11691 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11692 return Cmp;
11693}
11694
11695 SDValue AArch64TargetLowering::LowerSELECT_CC(
11696     ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11697     iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
11698     const SDLoc &DL, SelectionDAG &DAG) const {
11699 // Handle f128 first, because it will result in a comparison of some RTLIB
11700 // call result against zero.
11701 if (LHS.getValueType() == MVT::f128) {
11702 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11703
11704 // If softenSetCCOperands returned a scalar, we need to compare the result
11705 // against zero to select between true and false values.
11706 if (!RHS.getNode()) {
11707 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11708 CC = ISD::SETNE;
11709 }
11710 }
11711
11712 // Also handle f16, for which we need to do a f32 comparison.
11713 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11714 LHS.getValueType() == MVT::bf16) {
11715 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11716 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11717 }
11718
11719 // Next, handle integers.
11720 if (LHS.getValueType().isInteger()) {
11721 assert((LHS.getValueType() == RHS.getValueType()) &&
11722 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11723
11724 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11725 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11726 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11727
11728 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11729 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11730 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11731 // Both require less instructions than compare and conditional select.
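    // Sketch of why this works (illustrative, not from the original source),
    // for i32: (SRA lhs, 31) is 0 when lhs >= 0 and all-ones when lhs < 0, so
    //   smax(lhs, 0) = lhs & ~(lhs >> 31)   -> ASR + BIC
    //   smin(lhs, 0) = lhs &  (lhs >> 31)   -> ASR + AND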
11732 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11733 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11734 LHS.getValueType() == RHS.getValueType()) {
11735 EVT VT = LHS.getValueType();
11736 SDValue Shift =
11737 DAG.getNode(ISD::SRA, DL, VT, LHS,
11738 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11739
11740 if (CC == ISD::SETGT)
11741 Shift = DAG.getNOT(DL, Shift, VT);
11742
11743 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11744 }
11745
11746 // Canonicalise absolute difference patterns:
11747 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11748 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11749 //
11750 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11751 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11752 // The second forms can be matched into subs+cneg.
11753 // NOTE: Drop poison generating flags from the negated operand to avoid
11754 // inadvertently propagating poison after the canonicalisation.
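    // Illustrative example (assumed register names, not from the original
    // source):
    //   select_cc a, b, (sub a, b), (sub b, a), setgt
    // has its false operand rewritten to (neg (sub a, b)), which then matches
    //   subs w8, w0, w1
    //   cneg w0, w8, le
    // i.e. the absolute-difference idiom.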
11755 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11756 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11757 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11759 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11760 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11761 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11763 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11764 }
11765 }
11766
11767 unsigned Opcode = AArch64ISD::CSEL;
11768
11769     // If both the TVal and the FVal are constants, see if we can swap them in
11770     // order to form a CSINV or CSINC out of them.
11771 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11772 std::swap(TVal, FVal);
11773 std::swap(CTVal, CFVal);
11774 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11775 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11776 std::swap(TVal, FVal);
11777 std::swap(CTVal, CFVal);
11778 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11779 } else if (TVal.getOpcode() == ISD::XOR) {
11780 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11781 // with a CSINV rather than a CSEL.
11782 if (isAllOnesConstant(TVal.getOperand(1))) {
11783 std::swap(TVal, FVal);
11784 std::swap(CTVal, CFVal);
11785 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11786 }
11787 } else if (TVal.getOpcode() == ISD::SUB) {
11788 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11789 // that we can match with a CSNEG rather than a CSEL.
11790 if (isNullConstant(TVal.getOperand(0))) {
11791 std::swap(TVal, FVal);
11792 std::swap(CTVal, CFVal);
11793 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11794 }
11795 } else if (CTVal && CFVal) {
11796 const int64_t TrueVal = CTVal->getSExtValue();
11797 const int64_t FalseVal = CFVal->getSExtValue();
11798 bool Swap = false;
11799
11800 // If both TVal and FVal are constants, see if FVal is the
11801 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11802 // instead of a CSEL in that case.
11803 if (TrueVal == ~FalseVal) {
11804 Opcode = AArch64ISD::CSINV;
11805 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11806 TrueVal == -FalseVal) {
11807 Opcode = AArch64ISD::CSNEG;
11808 } else if (TVal.getValueType() == MVT::i32) {
11809 // If our operands are only 32-bit wide, make sure we use 32-bit
11810 // arithmetic for the check whether we can use CSINC. This ensures that
11811 // the addition in the check will wrap around properly in case there is
11812 // an overflow (which would not be the case if we do the check with
11813 // 64-bit arithmetic).
11814 const uint32_t TrueVal32 = CTVal->getZExtValue();
11815 const uint32_t FalseVal32 = CFVal->getZExtValue();
11816
11817 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11818 Opcode = AArch64ISD::CSINC;
11819
11820 if (TrueVal32 > FalseVal32) {
11821 Swap = true;
11822 }
11823 }
11824 } else {
11825 // 64-bit check whether we can use CSINC.
11826 const uint64_t TrueVal64 = TrueVal;
11827 const uint64_t FalseVal64 = FalseVal;
11828
11829 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11830 Opcode = AArch64ISD::CSINC;
11831
11832 if (TrueVal > FalseVal) {
11833 Swap = true;
11834 }
11835 }
11836 }
11837
11838 // Swap TVal and FVal if necessary.
11839 if (Swap) {
11840 std::swap(TVal, FVal);
11841 std::swap(CTVal, CFVal);
11842 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11843 }
11844
11845 if (Opcode != AArch64ISD::CSEL) {
11846 // Drop FVal since we can get its value by simply inverting/negating
11847 // TVal.
11848 FVal = TVal;
11849 }
11850 }
11851
11852 // Avoid materializing a constant when possible by reusing a known value in
11853 // a register. However, don't perform this optimization if the known value
11854 // is one, zero or negative one in the case of a CSEL. We can always
11855 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11856 // FVal, respectively.
11857 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11858 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11859 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11860     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11861     // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11862 // "a != C ? x : a" to avoid materializing C.
11863 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11864 TVal = LHS;
11865 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11866 FVal = LHS;
11867 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11868 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11869 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11870 // avoid materializing C.
11871     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11872     if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11873 Opcode = AArch64ISD::CSINV;
11874 TVal = LHS;
11875 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11876 }
11877 }
11878
11879 SDValue CCVal;
11880 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11881 EVT VT = TVal.getValueType();
11882 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11883 }
11884
11885 // Now we know we're dealing with FP values.
11886 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11887 LHS.getValueType() == MVT::f64);
11888 assert(LHS.getValueType() == RHS.getValueType());
11889 EVT VT = TVal.getValueType();
11890
11891 // If the purpose of the comparison is to select between all ones
11892 // or all zeros, try to use a vector comparison because the operands are
11893 // already stored in SIMD registers.
11894 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11895 switch (U->getOpcode()) {
11896 default:
11897 return false;
11900 case AArch64ISD::DUP:
11901 return true;
11902 }
11903 })) {
11904 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11905 SDValue VectorCmp =
11906 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11907 if (VectorCmp)
11908 return VectorCmp;
11909 }
11910
11911 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11912
11913 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11914 // clean. Some of them require two CSELs to implement.
11915 AArch64CC::CondCode CC1, CC2;
11916 changeFPCCToAArch64CC(CC, CC1, CC2);
11917
11918 if (Flags.hasNoSignedZeros()) {
11919 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11920 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11921 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11922 if (RHSVal && RHSVal->isZero()) {
11923 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11924 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11925
11926 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11927 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11928 TVal = LHS;
11929 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11930 CFVal && CFVal->isZero() &&
11931 FVal.getValueType() == LHS.getValueType())
11932 FVal = LHS;
11933 }
11934 }
11935
11936 // Emit first, and possibly only, CSEL.
11937 SDValue CC1Val = getCondCode(DAG, CC1);
11938 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11939
11940 // If we need a second CSEL, emit it, using the output of the first as the
11941 // RHS. We're effectively OR'ing the two CC's together.
11942 if (CC2 != AArch64CC::AL) {
11943 SDValue CC2Val = getCondCode(DAG, CC2);
11944 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11945 }
11946
11947 // Otherwise, return the output of the first CSEL.
11948 return CS1;
11949}
11950
11951SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11952 SelectionDAG &DAG) const {
11953 EVT Ty = Op.getValueType();
11954 auto Idx = Op.getConstantOperandAPInt(2);
11955 int64_t IdxVal = Idx.getSExtValue();
11956 assert(Ty.isScalableVector() &&
11957 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11958
11959 // We can use the splice instruction for certain index values where we are
11960 // able to efficiently generate the correct predicate. The index will be
11961 // inverted and used directly as the input to the ptrue instruction, i.e.
11962 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11963 // splice predicate. However, we can only do this if we can guarantee that
11964 // there are enough elements in the vector, hence we check the index <= min
11965 // number of elements.
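  // Illustrative example (assumed, not from the original source): splicing two
  // <vscale x 4 x i32> vectors with IdxVal == -2 becomes roughly
  //   ptrue  p0.s, vl2              // first two lanes active
  //   rev    p0.s, p0.s             // ... moved to the last two lanes
  //   splice z0.s, p0, z0.s, z1.s
  // which takes the last two elements of the first operand followed by
  // elements of the second.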
11966 std::optional<unsigned> PredPattern;
11967 if (Ty.isScalableVector() && IdxVal < 0 &&
11968 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11969 std::nullopt) {
11970 SDLoc DL(Op);
11971
11972 // Create a predicate where all but the last -IdxVal elements are false.
11973 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11974 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11975 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11976
11977 // Now splice the two inputs together using the predicate.
11978 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11979 Op.getOperand(1));
11980 }
11981
11982 // We can select to an EXT instruction when indexing the first 256 bytes.
11984 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11985 return Op;
11986
11987 return SDValue();
11988}
11989
11990SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11991 SelectionDAG &DAG) const {
11992 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11993 SDValue LHS = Op.getOperand(0);
11994 SDValue RHS = Op.getOperand(1);
11995 SDValue TVal = Op.getOperand(2);
11996 SDValue FVal = Op.getOperand(3);
11997 SDNodeFlags Flags = Op->getFlags();
11998 SDLoc DL(Op);
11999 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12000}
12001
12002SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12003 SelectionDAG &DAG) const {
12004 SDValue CCVal = Op->getOperand(0);
12005 SDValue TVal = Op->getOperand(1);
12006 SDValue FVal = Op->getOperand(2);
12007 SDLoc DL(Op);
12008
12009 EVT Ty = Op.getValueType();
12010 if (Ty == MVT::aarch64svcount) {
12011 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12012 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12013 SDValue Sel =
12014 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12015 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12016 }
12017
12018 if (Ty.isScalableVector()) {
12019 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12020 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12021 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12022 }
12023
12024 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12025 // FIXME: Ideally this would be the same as above using i1 types, however
12026 // for the moment we can't deal with fixed i1 vector types properly, so
12027 // instead extend the predicate to a result type sized integer vector.
12028 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12029 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12030 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12031 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12032 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12033 }
12034
12035 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12036 // instruction.
12037 if (ISD::isOverflowIntrOpRes(CCVal)) {
12038 // Only lower legal XALUO ops.
12039 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12040 return SDValue();
12041
12042     AArch64CC::CondCode OFCC;
12043     SDValue Value, Overflow;
12044 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12045 SDValue CCVal = getCondCode(DAG, OFCC);
12046
12047 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12048 CCVal, Overflow);
12049 }
12050
12051 // Lower it the same way as we would lower a SELECT_CC node.
12052 ISD::CondCode CC;
12053 SDValue LHS, RHS;
12054 if (CCVal.getOpcode() == ISD::SETCC) {
12055 LHS = CCVal.getOperand(0);
12056 RHS = CCVal.getOperand(1);
12057 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12058 } else {
12059 LHS = CCVal;
12060 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12061 CC = ISD::SETNE;
12062 }
12063
12064 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
12065 // order to use FCSELSrrr
12066 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12067 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12068 DAG.getUNDEF(MVT::f32), TVal);
12069 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12070 DAG.getUNDEF(MVT::f32), FVal);
12071 }
12072
12073 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12074 Op->getFlags(), DL, DAG);
12075
12076 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12077 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12078 }
12079
12080 return Res;
12081}
12082
12083SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12084 SelectionDAG &DAG) const {
12085   // Jump table entries are emitted as PC-relative offsets. No additional
12086   // tweaking is necessary here. Just get the address of the jump table.
12087 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12088
12089   CodeModel::Model CM = getTargetMachine().getCodeModel();
12090   if (CM == CodeModel::Large &&
12091       !Subtarget->isTargetMachO())
12092 return getAddrLarge(JT, DAG);
12093 if (CM == CodeModel::Tiny)
12094 return getAddrTiny(JT, DAG);
12095 return getAddr(JT, DAG);
12096}
12097
12098SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12099 SelectionDAG &DAG) const {
12100   // Jump table entries are emitted as PC-relative offsets. No additional
12101   // tweaking is necessary here. Just get the address of the jump table.
12102 SDLoc DL(Op);
12103 SDValue JT = Op.getOperand(1);
12104 SDValue Entry = Op.getOperand(2);
12105 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12106
12107 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12108 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12109
12110 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12111 // sequence later, to guarantee the integrity of the intermediate values.
12112   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12113           "aarch64-jump-table-hardening")) {
12114     CodeModel::Model CM = getTargetMachine().getCodeModel();
12115     if (Subtarget->isTargetMachO()) {
12116 if (CM != CodeModel::Small && CM != CodeModel::Large)
12117 report_fatal_error("Unsupported code-model for hardened jump-table");
12118 } else {
12119 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12120 assert(Subtarget->isTargetELF() &&
12121 "jump table hardening only supported on MachO/ELF");
12122 if (CM != CodeModel::Small)
12123 report_fatal_error("Unsupported code-model for hardened jump-table");
12124 }
12125
12126 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12127 Entry, SDValue());
12128 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12129 DAG.getTargetJumpTable(JTI, MVT::i32),
12130 X16Copy.getValue(0), X16Copy.getValue(1));
12131 return SDValue(B, 0);
12132 }
12133
12134 SDNode *Dest =
12135 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12136 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12137 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12138 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12139}
12140
12141SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12142 SDValue Chain = Op.getOperand(0);
12143 SDValue Dest = Op.getOperand(1);
12144
12145   // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
12146 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12147 if (Dest->isMachineOpcode() &&
12148 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12149 return SDValue();
12150
12151 const MachineFunction &MF = DAG.getMachineFunction();
12152 std::optional<uint16_t> BADisc =
12153 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12154 if (!BADisc)
12155 return SDValue();
12156
12157 SDLoc DL(Op);
12158
12159 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12160   SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12161   SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12162
12163 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12164 {Dest, Key, Disc, AddrDisc, Chain});
12165 return SDValue(BrA, 0);
12166}
12167
12168SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12169 SelectionDAG &DAG) const {
12170 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12171   CodeModel::Model CM = getTargetMachine().getCodeModel();
12172   if (CM == CodeModel::Large) {
12173 // Use the GOT for the large code model on iOS.
12174 if (Subtarget->isTargetMachO()) {
12175 return getGOT(CP, DAG);
12176 }
12178 return getAddrLarge(CP, DAG);
12179 } else if (CM == CodeModel::Tiny) {
12180 return getAddrTiny(CP, DAG);
12181 }
12182 return getAddr(CP, DAG);
12183}
12184
12185SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12186 SelectionDAG &DAG) const {
12187 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12188 const BlockAddress *BA = BAN->getBlockAddress();
12189
12190 if (std::optional<uint16_t> BADisc =
12191 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12192 *BA->getFunction())) {
12193 SDLoc DL(Op);
12194
12195 // This isn't cheap, but BRIND is rare.
12196 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12197
12198 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12199
12200     SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12201     SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12202
12203 SDNode *MOV =
12204 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12205 {TargetBA, Key, AddrDisc, Disc});
12206 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12207 SDValue(MOV, 1));
12208 }
12209
12210   CodeModel::Model CM = getTargetMachine().getCodeModel();
12211   if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12213 return getAddrLarge(BAN, DAG);
12214 } else if (CM == CodeModel::Tiny) {
12215 return getAddrTiny(BAN, DAG);
12216 }
12217 return getAddr(BAN, DAG);
12218}
12219
12220SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12221 SelectionDAG &DAG) const {
12222 AArch64FunctionInfo *FuncInfo =
12223 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12224
12225 SDLoc DL(Op);
12226   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12227                                  getPointerTy(DAG.getDataLayout()));
12228 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12229 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12230 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12231 MachinePointerInfo(SV));
12232}
12233
12234SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12235 SelectionDAG &DAG) const {
12236 MachineFunction &MF = DAG.getMachineFunction();
12237 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12238
12239 SDLoc DL(Op);
12240 SDValue FR;
12241 if (Subtarget->isWindowsArm64EC()) {
12242 // With the Arm64EC ABI, we compute the address of the varargs save area
12243 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12244 // but calls from an entry thunk can pass in a different address.
12245 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12246 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12247 uint64_t StackOffset;
12248 if (FuncInfo->getVarArgsGPRSize() > 0)
12249 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12250 else
12251 StackOffset = FuncInfo->getVarArgsStackOffset();
12252 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12253 DAG.getConstant(StackOffset, DL, MVT::i64));
12254 } else {
12255 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12256 ? FuncInfo->getVarArgsGPRIndex()
12257                                : FuncInfo->getVarArgsStackIndex(),
12258                            getPointerTy(DAG.getDataLayout()));
12259 }
12260 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12261 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12262 MachinePointerInfo(SV));
12263}
12264
12265SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12266 SelectionDAG &DAG) const {
12267 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12268 // Standard, section B.3.
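  // For reference, a sketch of that layout (per AAPCS64, offsets for LP64;
  // not part of the original source):
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GP register save area
  //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs; // offset 24: negative offset of unused GP area
  //     int   __vr_offs; // offset 28: negative offset of unused FP/SIMD area
  //   };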
12269 MachineFunction &MF = DAG.getMachineFunction();
12270 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12271 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12272 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12273 auto PtrVT = getPointerTy(DAG.getDataLayout());
12274 SDLoc DL(Op);
12275
12276 SDValue Chain = Op.getOperand(0);
12277 SDValue VAList = Op.getOperand(1);
12278 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12279   SmallVector<SDValue, 4> MemOps;
12280
12281 // void *__stack at offset 0
12282 unsigned Offset = 0;
12283 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12284 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12285 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12286 MachinePointerInfo(SV), Align(PtrSize)));
12287
12288 // void *__gr_top at offset 8 (4 on ILP32)
12289 Offset += PtrSize;
12290 int GPRSize = FuncInfo->getVarArgsGPRSize();
12291 if (GPRSize > 0) {
12292 SDValue GRTop, GRTopAddr;
12293
12294 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12295 DAG.getConstant(Offset, DL, PtrVT));
12296
12297 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12298 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12299 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12300 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12301
12302 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12303 MachinePointerInfo(SV, Offset),
12304 Align(PtrSize)));
12305 }
12306
12307 // void *__vr_top at offset 16 (8 on ILP32)
12308 Offset += PtrSize;
12309 int FPRSize = FuncInfo->getVarArgsFPRSize();
12310 if (FPRSize > 0) {
12311 SDValue VRTop, VRTopAddr;
12312 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12313 DAG.getConstant(Offset, DL, PtrVT));
12314
12315 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12316 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12317 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12318 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12319
12320 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12321 MachinePointerInfo(SV, Offset),
12322 Align(PtrSize)));
12323 }
12324
12325 // int __gr_offs at offset 24 (12 on ILP32)
12326 Offset += PtrSize;
12327 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12328 DAG.getConstant(Offset, DL, PtrVT));
12329 MemOps.push_back(
12330 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12331 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12332
12333 // int __vr_offs at offset 28 (16 on ILP32)
12334 Offset += 4;
12335 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12336 DAG.getConstant(Offset, DL, PtrVT));
12337 MemOps.push_back(
12338 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12339 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12340
12341 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12342}
12343
12344SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12345 SelectionDAG &DAG) const {
12346 MachineFunction &MF = DAG.getMachineFunction();
12347 Function &F = MF.getFunction();
12348
12349 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12350 return LowerWin64_VASTART(Op, DAG);
12351 else if (Subtarget->isTargetDarwin())
12352 return LowerDarwin_VASTART(Op, DAG);
12353 else
12354 return LowerAAPCS_VASTART(Op, DAG);
12355}
12356
12357SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12358 SelectionDAG &DAG) const {
12359 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
12360 // pointer.
12361 SDLoc DL(Op);
12362 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12363 unsigned VaListSize =
12364 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12365 ? PtrSize
12366 : Subtarget->isTargetILP32() ? 20 : 32;
12367 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12368 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12369
12370 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12371 DAG.getConstant(VaListSize, DL, MVT::i32),
12372 Align(PtrSize), false, false, /*CI=*/nullptr,
12373 std::nullopt, MachinePointerInfo(DestSV),
12374 MachinePointerInfo(SrcSV));
12375}
12376
12377SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12378 assert(Subtarget->isTargetDarwin() &&
12379 "automatic va_arg instruction only works on Darwin");
12380
12381 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12382 EVT VT = Op.getValueType();
12383 SDLoc DL(Op);
12384 SDValue Chain = Op.getOperand(0);
12385 SDValue Addr = Op.getOperand(1);
12386 MaybeAlign Align(Op.getConstantOperandVal(3));
12387 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12388 auto PtrVT = getPointerTy(DAG.getDataLayout());
12389 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12390 SDValue VAList =
12391 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12392 Chain = VAList.getValue(1);
12393 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12394
12395 if (VT.isScalableVector())
12396 report_fatal_error("Passing SVE types to variadic functions is "
12397 "currently not supported");
12398
12399 if (Align && *Align > MinSlotSize) {
12400 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12401 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12402 VAList =
12403 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12404 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12405 }
12406
12407 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12408 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12409
12410 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12411 // up to 64 bits. At the very least, we have to increase the striding of the
12412 // vaargs list to match this, and for FP values we need to introduce
12413 // FP_ROUND nodes as well.
12414 if (VT.isInteger() && !VT.isVector())
12415 ArgSize = std::max(ArgSize, MinSlotSize);
12416 bool NeedFPTrunc = false;
12417 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12418 ArgSize = 8;
12419 NeedFPTrunc = true;
12420 }
12421
12422 // Increment the pointer, VAList, to the next vaarg
12423 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12424 DAG.getConstant(ArgSize, DL, PtrVT));
12425 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12426
12427 // Store the incremented VAList to the legalized pointer
12428 SDValue APStore =
12429 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12430
12431 // Load the actual argument out of the pointer VAList
12432 if (NeedFPTrunc) {
12433 // Load the value as an f64.
12434 SDValue WideFP =
12435 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12436 // Round the value down to an f32.
12437 SDValue NarrowFP =
12438 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12439 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12440 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12441 // Merge the rounded value with the chain output of the load.
12442 return DAG.getMergeValues(Ops, DL);
12443 }
12444
12445 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12446}
12447
12448SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12449 SelectionDAG &DAG) const {
12450 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12451 MFI.setFrameAddressIsTaken(true);
12452
12453 EVT VT = Op.getValueType();
12454 SDLoc DL(Op);
12455 unsigned Depth = Op.getConstantOperandVal(0);
12456 SDValue FrameAddr =
12457 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12458 while (Depth--)
12459 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12460 MachinePointerInfo());
12461
12462 if (Subtarget->isTargetILP32())
12463 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12464 DAG.getValueType(VT));
12465
12466 return FrameAddr;
12467}
12468
12469SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12470 SelectionDAG &DAG) const {
12471 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12472
12473 EVT VT = getPointerTy(DAG.getDataLayout());
12474 int FI = MFI.CreateFixedObject(4, 0, false);
12475 return DAG.getFrameIndex(FI, VT);
12476}
12477
12478#define GET_REGISTER_MATCHER
12479#include "AArch64GenAsmMatcher.inc"
12480
12481// FIXME? Maybe this could be a TableGen attribute on some registers and
12482// this table could be generated automatically from RegInfo.
12483Register AArch64TargetLowering::
12484getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12485   Register Reg = MatchRegisterName(RegName);
12486   if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12487 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12488 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12489 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12490 !MRI->isReservedReg(MF, Reg))
12491 Reg = Register();
12492 }
12493 return Reg;
12494}
12495
12496SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12497 SelectionDAG &DAG) const {
12499
12500 EVT VT = Op.getValueType();
12501 SDLoc DL(Op);
12502
12503 SDValue FrameAddr =
12504 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12506
12507 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12508}
12509
12510SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12511 SelectionDAG &DAG) const {
12512 MachineFunction &MF = DAG.getMachineFunction();
12513 MachineFrameInfo &MFI = MF.getFrameInfo();
12514 MFI.setReturnAddressIsTaken(true);
12515
12516 EVT VT = Op.getValueType();
12517 SDLoc DL(Op);
12518 unsigned Depth = Op.getConstantOperandVal(0);
12519 SDValue ReturnAddress;
12520 if (Depth) {
12521 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12523 ReturnAddress = DAG.getLoad(
12524 VT, DL, DAG.getEntryNode(),
12525 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12526 } else {
12527 // Return LR, which contains the return address. Mark it an implicit
12528 // live-in.
12529 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12530 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12531 }
12532
12533   // The XPACLRI instruction assembles to a hint-space instruction before
12534   // Armv8.3-A, therefore it can be safely used for any pre-Armv8.3-A
12535   // architecture. On Armv8.3-A and onwards XPACI is available, so use
12536   // that instead.
12537 SDNode *St;
12538 if (Subtarget->hasPAuth()) {
12539 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12540 } else {
12541 // XPACLRI operates on LR therefore we must move the operand accordingly.
12542 SDValue Chain =
12543 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12544 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12545 }
12546 return SDValue(St, 0);
12547}
12548
12549/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
12550/// i32 values and take a 2 x i32 value to shift plus a shift amount.
12551SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12552 SelectionDAG &DAG) const {
12553 SDValue Lo, Hi;
12554 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12555 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12556}
12557
12558 bool AArch64TargetLowering::isOffsetFoldingLegal(
12559                                                const GlobalAddressSDNode *GA) const {
12560 // Offsets are folded in the DAG combine rather than here so that we can
12561 // intelligently choose an offset based on the uses.
12562 return false;
12563}
12564
12565 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12566                                          bool OptForSize) const {
12567 bool IsLegal = false;
12568 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
12569 // 16-bit case when target has full fp16 support.
12570 // We encode bf16 bit patterns as if they were fp16. This results in very
12571 // strange looking assembly but should populate the register with appropriate
12572 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12573 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12574 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
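  // Worked bit pattern for that example (illustrative, not from the original
  // source): BF16 1.5 is 0x3FC0; read as FP16, 0x3FC0 is 1.9375; and the FMOV
  // imm8 encoding 0x7F expands to exactly 0x3FC0, so one immediate move
  // materializes both.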
12575 // FIXME: We should be able to handle f128 as well with a clever lowering.
12576 const APInt ImmInt = Imm.bitcastToAPInt();
12577 if (VT == MVT::f64)
12578 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12579 else if (VT == MVT::f32)
12580 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12581 else if (VT == MVT::f16 || VT == MVT::bf16)
12582 IsLegal =
12583 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12584 Imm.isPosZero();
12585
12586 // If we can not materialize in immediate field for fmov, check if the
12587 // value can be encoded as the immediate operand of a logical instruction.
12588 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12589 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12590 // generate that fmov.
12591 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12592 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12593 // however the mov+fmov sequence is always better because of the reduced
12594 // cache pressure. The timings are still the same if you consider
12595 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12596     // movw+movk is fused). So we limit it to at most 2 instructions.
12597     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12598     AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12599     assert(Insn.size() <= 4 &&
12600 "Should be able to build any value with at most 4 moves");
12601 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12602 IsLegal = Insn.size() <= Limit;
12603 }
12604
12605 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12606 << " imm value: "; Imm.dump(););
12607 return IsLegal;
12608}
12609
12610//===----------------------------------------------------------------------===//
12611// AArch64 Optimization Hooks
12612//===----------------------------------------------------------------------===//
12613
12614static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12615 SDValue Operand, SelectionDAG &DAG,
12616 int &ExtraSteps) {
12617 EVT VT = Operand.getValueType();
12618 if ((ST->hasNEON() &&
12619 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12620 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12621 VT == MVT::v4f32)) ||
12622 (ST->hasSVE() &&
12623 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12624     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12625       // For the reciprocal estimates, convergence is quadratic, so the number
12626 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12627 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12628 // the result for float (23 mantissa bits) is 2 and for double (52
12629 // mantissa bits) is 3.
12630 constexpr unsigned AccurateBits = 8;
12631 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12632 ExtraSteps = DesiredBits <= AccurateBits
12633 ? 0
12634 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
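      // Worked example (illustrative): DesiredBits for float is 24 (23 mantissa
      // bits plus the implicit bit), so ExtraSteps = ceil(log2(24)) -
      // ceil(log2(8)) = 5 - 3 = 2; for double (53 bits) it is 6 - 3 = 3.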
12635 }
12636
12637 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12638 }
12639
12640 return SDValue();
12641}
12642
12643SDValue
12644AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12645 const DenormalMode &Mode) const {
12646 SDLoc DL(Op);
12647 EVT VT = Op.getValueType();
12648 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12649 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12650 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12651}
12652
12653SDValue
12654AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12655 SelectionDAG &DAG) const {
12656 return Op;
12657}
12658
12659SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12660 SelectionDAG &DAG, int Enabled,
12661 int &ExtraSteps,
12662 bool &UseOneConst,
12663 bool Reciprocal) const {
12664   if (Enabled == ReciprocalEstimate::Enabled ||
12665       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12666 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12667 DAG, ExtraSteps)) {
12668 SDLoc DL(Operand);
12669 EVT VT = Operand.getValueType();
12670
12671 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12672       SDNodeFlags Flags =
12673           SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
12674
12675 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12676 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
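      // Spelling out one step (illustrative): Step = E * E, then
      // FRSQRTS(X, Step) = (3 - X * E^2) / 2, and the new estimate is
      // E' = E * (3 - X * E^2) / 2, the Newton-Raphson update for 1/sqrt(X).
      // When a full sqrt is requested, the final FMUL by the operand below
      // turns the refined 1/sqrt(X) into sqrt(X).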
12677 for (int i = ExtraSteps; i > 0; --i) {
12678 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12679 Flags);
12680 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12681 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12682 }
12683 if (!Reciprocal)
12684 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12685
12686 ExtraSteps = 0;
12687 return Estimate;
12688 }
12689
12690 return SDValue();
12691}
12692
12693SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12694 SelectionDAG &DAG, int Enabled,
12695 int &ExtraSteps) const {
12696   if (Enabled == ReciprocalEstimate::Enabled)
12697     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12698 DAG, ExtraSteps)) {
12699 SDLoc DL(Operand);
12700 EVT VT = Operand.getValueType();
12701
12703
12704 // Newton reciprocal iteration: E * (2 - X * E)
12705 // AArch64 reciprocal iteration instruction: (2 - M * N)
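      // Spelling out one step (illustrative): FRECPS(X, E) = 2 - X * E, so the
      // new estimate is E' = E * (2 - X * E), the Newton-Raphson update for 1/X.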
12706 for (int i = ExtraSteps; i > 0; --i) {
12707 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12708 Estimate, Flags);
12709 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12710 }
12711
12712 ExtraSteps = 0;
12713 return Estimate;
12714 }
12715
12716 return SDValue();
12717}
12718
12719//===----------------------------------------------------------------------===//
12720// AArch64 Inline Assembly Support
12721//===----------------------------------------------------------------------===//
12722
12723// Table of Constraints
12724// TODO: This is the current set of constraints supported by ARM for the
12725 // compiler; not all of them may make sense.
12726//
12727// r - A general register
12728// w - An FP/SIMD register of some size in the range v0-v31
12729// x - An FP/SIMD register of some size in the range v0-v15
12730// I - Constant that can be used with an ADD instruction
12731// J - Constant that can be used with a SUB instruction
12732// K - Constant that can be used with a 32-bit logical instruction
12733// L - Constant that can be used with a 64-bit logical instruction
12734// M - Constant that can be used as a 32-bit MOV immediate
12735// N - Constant that can be used as a 64-bit MOV immediate
12736// Q - A memory reference with base register and no offset
12737// S - A symbolic address
12738// Y - Floating point constant zero
12739// Z - Integer constant zero
12740//
12741// Note that general register operands will be output using their 64-bit x
12742// register name, whatever the size of the variable, unless the asm operand
12743// is prefixed by the %w modifier. Floating-point and SIMD register operands
12744// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12745// %q modifier.
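//
// A minimal usage sketch (illustrative only):
//   int add_imm(int x) {
//     int r;
//     asm("add %w0, %w1, %2" : "=r"(r) : "r"(x), "I"(4095));
//     return r;
//   }
// Here "I" accepts 4095 because it is a valid ADD immediate, and the %w
// modifier selects the 32-bit (w) name of the general register operand.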
12746const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12747 // At this point, we have to lower this constraint to something else, so we
12748 // lower it to an "r" or "w". However, by doing this we will force the result
12749 // to be in register, while the X constraint is much more permissive.
12750 //
12751 // Although we are correct (we are free to emit anything, without
12752 // constraints), we might break use cases that would expect us to be more
12753 // efficient and emit something else.
12754 if (!Subtarget->hasFPARMv8())
12755 return "r";
12756
12757 if (ConstraintVT.isFloatingPoint())
12758 return "w";
12759
12760 if (ConstraintVT.isVector() &&
12761 (ConstraintVT.getSizeInBits() == 64 ||
12762 ConstraintVT.getSizeInBits() == 128))
12763 return "w";
12764
12765 return "r";
12766}
12767
12768 enum class PredicateConstraint { Uph, Upl, Upa };
12769
12770// Returns a {Reg, RegisterClass} tuple if the constraint is
12771// a specific predicate register.
12772//
12773 // For a constraint like "{pn3}", the default path in
12774// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12775// suitable register class for this register is "PPRorPNR", after which it
12776// determines that nxv16i1 is an appropriate type for the constraint, which is
12777// not what we want. The code here pre-empts this by matching the register
12778// explicitly.
12779static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12780 parseSVERegAsConstraint(StringRef Constraint) {
12781   if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12782 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12783 return std::nullopt;
12784
12785 bool IsPredicate = Constraint[1] == 'p';
12786 Constraint = Constraint.substr(2, Constraint.size() - 3);
12787 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12788 if (IsPredicateAsCount)
12789 Constraint = Constraint.drop_front(1);
12790
12791 unsigned V;
12792 if (Constraint.getAsInteger(10, V) || V > 31)
12793 return std::nullopt;
12794
12795 if (IsPredicateAsCount)
12796 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12797 if (IsPredicate)
12798 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12799 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12800}
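// For example, "{pn3}" maps to (AArch64::PN3, PNRRegClass), "{p3}" to
// (AArch64::P3, PPRRegClass), and "{z3}" to (AArch64::Z3, ZPRRegClass).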
12801
12802static std::optional<PredicateConstraint>
12803 parsePredicateConstraint(StringRef Constraint) {
12804   return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12805       .Case("Uph", PredicateConstraint::Uph)
12806       .Case("Upl", PredicateConstraint::Upl)
12807       .Case("Upa", PredicateConstraint::Upa)
12808       .Default(std::nullopt);
12809}
12810
12811static const TargetRegisterClass *
12812 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12813   if (VT != MVT::aarch64svcount &&
12814 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12815 return nullptr;
12816
12817 switch (Constraint) {
12818   case PredicateConstraint::Uph:
12819     return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12820 : &AArch64::PPR_p8to15RegClass;
12821   case PredicateConstraint::Upl:
12822     return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12823 : &AArch64::PPR_3bRegClass;
12824   case PredicateConstraint::Upa:
12825     return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12826 : &AArch64::PPRRegClass;
12827 }
12828
12829 llvm_unreachable("Missing PredicateConstraint!");
12830}
12831
12832 enum class ReducedGprConstraint { Uci, Ucj };
12833
12834static std::optional<ReducedGprConstraint>
12835 parseReducedGprConstraint(StringRef Constraint) {
12836   return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12837       .Case("Uci", ReducedGprConstraint::Uci)
12838       .Case("Ucj", ReducedGprConstraint::Ucj)
12839       .Default(std::nullopt);
12840}
12841
12842static const TargetRegisterClass *
12843 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12844   if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12845 return nullptr;
12846
12847 switch (Constraint) {
12848   case ReducedGprConstraint::Uci:
12849     return &AArch64::MatrixIndexGPR32_8_11RegClass;
12850   case ReducedGprConstraint::Ucj:
12851     return &AArch64::MatrixIndexGPR32_12_15RegClass;
12852 }
12853
12854 llvm_unreachable("Missing ReducedGprConstraint!");
12855}
12856
12857// The set of cc code supported is from
12858// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12859 static AArch64CC::CondCode parseConstraintCode(StringRef Constraint) {
12860   AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12861                                  .Case("{@cchi}", AArch64CC::HI)
12862 .Case("{@cccs}", AArch64CC::HS)
12863 .Case("{@cclo}", AArch64CC::LO)
12864 .Case("{@ccls}", AArch64CC::LS)
12865 .Case("{@cccc}", AArch64CC::LO)
12866 .Case("{@cceq}", AArch64CC::EQ)
12867 .Case("{@ccgt}", AArch64CC::GT)
12868 .Case("{@ccge}", AArch64CC::GE)
12869 .Case("{@cclt}", AArch64CC::LT)
12870 .Case("{@ccle}", AArch64CC::LE)
12871 .Case("{@cchs}", AArch64CC::HS)
12872 .Case("{@ccne}", AArch64CC::NE)
12873 .Case("{@ccvc}", AArch64CC::VC)
12874 .Case("{@ccpl}", AArch64CC::PL)
12875 .Case("{@ccvs}", AArch64CC::VS)
12876 .Case("{@ccmi}", AArch64CC::MI)
12877                                  .Default(AArch64CC::Invalid);
12878   return Cond;
12879}
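// For example (illustrative), a flag-output operand written as
//   asm("cmp %w1, %w2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// reaches this lowering with the constraint string "{@cceq}" and is mapped to
// AArch64CC::EQ above.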
12880
12881/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12882/// WZR, invert(<cond>)'.
12883 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12884                         SelectionDAG &DAG) {
12885 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12886 DAG.getConstant(0, DL, MVT::i32),
12887 DAG.getConstant(0, DL, MVT::i32),
12888 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12889}
12890
12891// Lower @cc flag output via getSETCC.
12892SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12893 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12894 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12895 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12896 if (Cond == AArch64CC::Invalid)
12897 return SDValue();
12898 // The output variable should be a scalar integer.
12899 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12900 OpInfo.ConstraintVT.getSizeInBits() < 8)
12901 report_fatal_error("Flag output operand is of invalid type");
12902
12903 // Get NZCV register. Only update chain when copyfrom is glued.
12904 if (Glue.getNode()) {
12905 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12906 Chain = Glue.getValue(1);
12907 } else
12908 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12909 // Extract CC code.
12910 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12911
12912   SDValue Result;
12913
12914 // Truncate or ZERO_EXTEND based on value types.
12915 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12916 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12917 else
12918 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12919
12920 return Result;
12921}
12922
12923/// getConstraintType - Given a constraint letter, return the type of
12924/// constraint it is for this target.
12925 AArch64TargetLowering::ConstraintType
12926 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12927 if (Constraint.size() == 1) {
12928 switch (Constraint[0]) {
12929 default:
12930 break;
12931 case 'x':
12932 case 'w':
12933 case 'y':
12934 return C_RegisterClass;
12935 // An address with a single base register. Due to the way we
12936 // currently handle addresses it is the same as 'r'.
12937 case 'Q':
12938 return C_Memory;
12939 case 'I':
12940 case 'J':
12941 case 'K':
12942 case 'L':
12943 case 'M':
12944 case 'N':
12945 case 'Y':
12946 case 'Z':
12947 return C_Immediate;
12948 case 'z':
12949 case 'S': // A symbol or label reference with a constant offset
12950 return C_Other;
12951 }
12952 } else if (parsePredicateConstraint(Constraint))
12953 return C_RegisterClass;
12954 else if (parseReducedGprConstraint(Constraint))
12955 return C_RegisterClass;
12956 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12957 return C_Other;
12958 return TargetLowering::getConstraintType(Constraint);
12959}
12960
12961/// Examine constraint type and operand type and determine a weight value.
12962/// This object must already have been set up with the operand type
12963/// and the current alternative constraint selected.
12964 TargetLowering::ConstraintWeight
12965 AArch64TargetLowering::getSingleConstraintMatchWeight(
12966 AsmOperandInfo &info, const char *constraint) const {
12967   ConstraintWeight weight = CW_Invalid;
12968   Value *CallOperandVal = info.CallOperandVal;
12969 // If we don't have a value, we can't do a match,
12970 // but allow it at the lowest weight.
12971 if (!CallOperandVal)
12972 return CW_Default;
12973 Type *type = CallOperandVal->getType();
12974 // Look at the constraint type.
12975 switch (*constraint) {
12976 default:
12977     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12978     break;
12979 case 'x':
12980 case 'w':
12981 case 'y':
12982 if (type->isFloatingPointTy() || type->isVectorTy())
12983 weight = CW_Register;
12984 break;
12985 case 'z':
12986 weight = CW_Constant;
12987 break;
12988 case 'U':
12989 if (parsePredicateConstraint(constraint) ||
12990 parseReducedGprConstraint(constraint))
12991 weight = CW_Register;
12992 break;
12993 }
12994 return weight;
12995}
12996
12997std::pair<unsigned, const TargetRegisterClass *>
12998AArch64TargetLowering::getRegForInlineAsmConstraint(
12999 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13000 if (Constraint.size() == 1) {
13001 switch (Constraint[0]) {
13002 case 'r':
13003 if (VT.isScalableVector())
13004 return std::make_pair(0U, nullptr);
13005 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13006 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13007 if (VT.getFixedSizeInBits() == 64)
13008 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13009 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13010 case 'w': {
13011 if (!Subtarget->hasFPARMv8())
13012 break;
13013 if (VT.isScalableVector()) {
13014 if (VT.getVectorElementType() != MVT::i1)
13015 return std::make_pair(0U, &AArch64::ZPRRegClass);
13016 return std::make_pair(0U, nullptr);
13017 }
13018 if (VT == MVT::Other)
13019 break;
13020 uint64_t VTSize = VT.getFixedSizeInBits();
13021 if (VTSize == 16)
13022 return std::make_pair(0U, &AArch64::FPR16RegClass);
13023 if (VTSize == 32)
13024 return std::make_pair(0U, &AArch64::FPR32RegClass);
13025 if (VTSize == 64)
13026 return std::make_pair(0U, &AArch64::FPR64RegClass);
13027 if (VTSize == 128)
13028 return std::make_pair(0U, &AArch64::FPR128RegClass);
13029 break;
13030 }
13031 // The instructions that this constraint is designed for can
13032 // only take 128-bit registers so just use that regclass.
13033 case 'x':
13034 if (!Subtarget->hasFPARMv8())
13035 break;
13036 if (VT.isScalableVector())
13037 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13038 if (VT.getSizeInBits() == 128)
13039 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13040 break;
13041 case 'y':
13042 if (!Subtarget->hasFPARMv8())
13043 break;
13044 if (VT.isScalableVector())
13045 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13046 break;
13047 }
13048 } else {
13049 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13050 // SME functions that are not in streaming mode, should
13051 // still observe clobbers of Z-registers by clobbering
13052 // the lower 128bits of those registers.
13053 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13054 !Subtarget->isSVEorStreamingSVEAvailable())
13055 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13056 &AArch64::FPR128RegClass);
13057 return *P;
13058 }
13059 if (const auto PC = parsePredicateConstraint(Constraint))
13060 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13061 return std::make_pair(0U, RegClass);
13062
13063 if (const auto RGC = parseReducedGprConstraint(Constraint))
13064 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13065 return std::make_pair(0U, RegClass);
13066 }
13067 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13068       parseConstraintCode(Constraint) != AArch64CC::Invalid)
13069     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13070
13071 if (Constraint == "{za}") {
13072 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13073 }
13074
13075 if (Constraint == "{zt0}") {
13076 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13077 }
13078
13079 // Use the default implementation in TargetLowering to convert the register
13080 // constraint into a member of a register class.
13081 std::pair<unsigned, const TargetRegisterClass *> Res;
13082   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13083
13084 // Not found as a standard register?
13085 if (!Res.second) {
13086 unsigned Size = Constraint.size();
13087 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13088 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13089 int RegNo;
13090 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13091 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13092 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13093 // By default we'll emit v0-v31 for this unless there's a modifier where
13094 // we'll emit the correct register as well.
13095 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13096 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13097 Res.second = &AArch64::FPR64RegClass;
13098 } else {
13099 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13100 Res.second = &AArch64::FPR128RegClass;
13101 }
13102 }
13103 }
13104 }
13105
13106 if (Res.second && !Subtarget->hasFPARMv8() &&
13107 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13108 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13109 return std::make_pair(0U, nullptr);
13110
13111 return Res;
13112}
13113
13114 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13115                                                   llvm::Type *Ty,
13116 bool AllowUnknown) const {
13117 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13118 return EVT(MVT::i64x8);
13119
13120 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13121}
13122
13123/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13124/// vector. If it is invalid, don't add anything to Ops.
13125void AArch64TargetLowering::LowerAsmOperandForConstraint(
13126 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13127 SelectionDAG &DAG) const {
13128 SDValue Result;
13129
13130 // Currently only support length 1 constraints.
13131 if (Constraint.size() != 1)
13132 return;
13133
13134 char ConstraintLetter = Constraint[0];
13135 switch (ConstraintLetter) {
13136 default:
13137 break;
13138
13139 // This set of constraints deal with valid constants for various instructions.
13140 // Validate and return a target constant for them if we can.
13141 case 'z': {
13142 // 'z' maps to xzr or wzr so it needs an input of 0.
13143 if (!isNullConstant(Op))
13144 return;
13145
13146 if (Op.getValueType() == MVT::i64)
13147 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13148 else
13149 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13150 break;
13151 }
13152 case 'S':
13153 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13154 // supported for PIC while "s" isn't, making "s" less useful. We implement
13155 // "S" but not "s".
13157 break;
13158
13159 case 'I':
13160 case 'J':
13161 case 'K':
13162 case 'L':
13163 case 'M':
13164 case 'N':
13165     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13166     if (!C)
13167 return;
13168
13169 // Grab the value and do some validation.
13170 uint64_t CVal = C->getZExtValue();
13171 switch (ConstraintLetter) {
13172 // The I constraint applies only to simple ADD or SUB immediate operands:
13173 // i.e. 0 to 4095 with optional shift by 12
13174 // The J constraint applies only to ADD or SUB immediates that would be
13175 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13176 // instruction [or vice versa], in other words -1 to -4095 with optional
13177 // left shift by 12.
13178 case 'I':
13179 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13180 break;
13181 return;
13182 case 'J': {
13183 uint64_t NVal = -C->getSExtValue();
13184 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13185 CVal = C->getSExtValue();
13186 break;
13187 }
13188 return;
13189 }
13190 // The K and L constraints apply *only* to logical immediates, including
13191 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13192 // been removed and MOV should be used). So these constraints have to
13193 // distinguish between bit patterns that are valid 32-bit or 64-bit
13194 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13195 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13196 // versa.
13197 case 'K':
13198 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13199 break;
13200 return;
13201 case 'L':
13202 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13203 break;
13204 return;
13205 // The M and N constraints are a superset of K and L respectively, for use
13206 // with the MOV (immediate) alias. As well as the logical immediates they
13207 // also match 32 or 64-bit immediates that can be loaded either using a
13208 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13209 // (M) or 64-bit 0x1234000000000000 (N) etc.
13210 // As a note some of this code is liberally stolen from the asm parser.
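    // Worked examples (illustrative): 0x12340000 matches M because it is a
    // single "movz w0, #0x1234, lsl #16"; 0xffffedca matches M because its
    // bitwise NOT is 0x00001235, which fits a single MOVN; and
    // 0x1234000000000000 matches N via "movz x0, #0x1234, lsl #48".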
13211 case 'M': {
13212 if (!isUInt<32>(CVal))
13213 return;
13214 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13215 break;
13216 if ((CVal & 0xFFFF) == CVal)
13217 break;
13218 if ((CVal & 0xFFFF0000ULL) == CVal)
13219 break;
13220 uint64_t NCVal = ~(uint32_t)CVal;
13221 if ((NCVal & 0xFFFFULL) == NCVal)
13222 break;
13223 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13224 break;
13225 return;
13226 }
13227 case 'N': {
13228 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13229 break;
13230 if ((CVal & 0xFFFFULL) == CVal)
13231 break;
13232 if ((CVal & 0xFFFF0000ULL) == CVal)
13233 break;
13234 if ((CVal & 0xFFFF00000000ULL) == CVal)
13235 break;
13236 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13237 break;
13238 uint64_t NCVal = ~CVal;
13239 if ((NCVal & 0xFFFFULL) == NCVal)
13240 break;
13241 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13242 break;
13243 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13244 break;
13245 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13246 break;
13247 return;
13248 }
13249 default:
13250 return;
13251 }
13252
13253 // All assembler immediates are 64-bit integers.
13254 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13255 break;
13256 }
13257
13258 if (Result.getNode()) {
13259 Ops.push_back(Result);
13260 return;
13261 }
13262
13263 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13264}
13265
13266//===----------------------------------------------------------------------===//
13267// AArch64 Advanced SIMD Support
13268//===----------------------------------------------------------------------===//
13269
13270/// WidenVector - Given a value in the V64 register class, produce the
13271/// equivalent value in the V128 register class.
13272 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13273   EVT VT = V64Reg.getValueType();
13274 unsigned NarrowSize = VT.getVectorNumElements();
13275 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13276 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13277 SDLoc DL(V64Reg);
13278
13279 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13280 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13281}
13282
13283/// getExtFactor - Determine the adjustment factor for the position when
13284/// generating an "extract from vector registers" instruction.
13285static unsigned getExtFactor(SDValue &V) {
13286 EVT EltType = V.getValueType().getVectorElementType();
13287 return EltType.getSizeInBits() / 8;
13288}
13289
13290// Check if a vector is built from one vector via extracted elements of
13291// another together with an AND mask, ensuring that all elements fit
13292// within range. This can be reconstructed using AND and NEON's TBL1.
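// For example (illustrative), a v8i8 build_vector whose lane i is
//   extractelt(SourceVec, and(extractelt(MaskVec, i), 7))
// becomes and(MaskVec, splat(7)) fed into a tbl1 whose table is SourceVec
// (concatenated with undef to form the required v16i8 table).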
13293 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
13294   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13295 SDLoc DL(Op);
13296 EVT VT = Op.getValueType();
13297 assert(!VT.isScalableVector() &&
13298 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13299
13300 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13301 // directly to TBL1.
13302 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13303 return SDValue();
13304
13305 unsigned NumElts = VT.getVectorNumElements();
13306 assert((NumElts == 8 || NumElts == 16) &&
13307 "Need to have exactly 8 or 16 elements in vector.");
13308
13309 SDValue SourceVec;
13310 SDValue MaskSourceVec;
13311 SmallVector<SDValue, 16> AndMaskConstants;
13312
13313 for (unsigned i = 0; i < NumElts; ++i) {
13314 SDValue V = Op.getOperand(i);
13315 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13316 return SDValue();
13317
13318 SDValue OperandSourceVec = V.getOperand(0);
13319 if (!SourceVec)
13320 SourceVec = OperandSourceVec;
13321 else if (SourceVec != OperandSourceVec)
13322 return SDValue();
13323
13324 // This only looks at shuffles with elements that are
13325 // a) truncated by a constant AND mask extracted from a mask vector, or
13326 // b) extracted directly from a mask vector.
13327 SDValue MaskSource = V.getOperand(1);
13328 if (MaskSource.getOpcode() == ISD::AND) {
13329 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13330 return SDValue();
13331
13332 AndMaskConstants.push_back(MaskSource.getOperand(1));
13333 MaskSource = MaskSource->getOperand(0);
13334 } else if (!AndMaskConstants.empty()) {
13335 // Either all or no operands should have an AND mask.
13336 return SDValue();
13337 }
13338
13339 // An ANY_EXTEND may be inserted between the AND and the source vector
13340 // extraction. We don't care about that, so we can just skip it.
13341 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13342 MaskSource = MaskSource.getOperand(0);
13343
13344 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13345 return SDValue();
13346
13347 SDValue MaskIdx = MaskSource.getOperand(1);
13348 if (!isa<ConstantSDNode>(MaskIdx) ||
13349 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13350 return SDValue();
13351
13352 // We only apply this if all elements come from the same vector with the
13353 // same vector type.
13354 if (!MaskSourceVec) {
13355 MaskSourceVec = MaskSource->getOperand(0);
13356 if (MaskSourceVec.getValueType() != VT)
13357 return SDValue();
13358 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13359 return SDValue();
13360 }
13361 }
13362
13363 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13364 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13365 // insert, we know that the index in the mask must be smaller than the number
13366 // of elements in the source, or we would have an out-of-bounds access.
13367 if (NumElts == 8)
13368 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13369 DAG.getUNDEF(VT));
13370
13371 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13372 if (!AndMaskConstants.empty())
13373 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13374 DAG.getBuildVector(VT, DL, AndMaskConstants));
13375
13376 return DAG.getNode(
13377       ISD::INTRINSIC_WO_CHAIN, DL, VT,
13378       DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13379 MaskSourceVec);
13380}
13381
13382// Gather data to see if the operation can be modelled as a
13383// shuffle in combination with VEXTs.
13384 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13385                                                   SelectionDAG &DAG) const {
13386 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13387 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13388 SDLoc DL(Op);
13389 EVT VT = Op.getValueType();
13390 assert(!VT.isScalableVector() &&
13391 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13392 unsigned NumElts = VT.getVectorNumElements();
13393
13394 struct ShuffleSourceInfo {
13395 SDValue Vec;
13396 unsigned MinElt;
13397 unsigned MaxElt;
13398
13399 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13400 // be compatible with the shuffle we intend to construct. As a result
13401 // ShuffleVec will be some sliding window into the original Vec.
13402 SDValue ShuffleVec;
13403
13404 // Code should guarantee that element i in Vec starts at element
13405 // "WindowBase + i * WindowScale" in ShuffleVec.
13406 int WindowBase;
13407 int WindowScale;
13408
13409 ShuffleSourceInfo(SDValue Vec)
13410 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13411 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13412
13413 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13414 };
13415
13416 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13417 // node.
13418   SmallVector<ShuffleSourceInfo, 2> Sources;
13419   for (unsigned i = 0; i < NumElts; ++i) {
13420 SDValue V = Op.getOperand(i);
13421 if (V.isUndef())
13422 continue;
13423 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13424 !isa<ConstantSDNode>(V.getOperand(1)) ||
13425 V.getOperand(0).getValueType().isScalableVector()) {
13426 LLVM_DEBUG(
13427 dbgs() << "Reshuffle failed: "
13428 "a shuffle can only come from building a vector from "
13429 "various elements of other fixed-width vectors, provided "
13430 "their indices are constant\n");
13431 return SDValue();
13432 }
13433
13434 // Add this element source to the list if it's not already there.
13435 SDValue SourceVec = V.getOperand(0);
13436 auto Source = find(Sources, SourceVec);
13437 if (Source == Sources.end())
13438 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13439
13440 // Update the minimum and maximum lane number seen.
13441 unsigned EltNo = V.getConstantOperandVal(1);
13442 Source->MinElt = std::min(Source->MinElt, EltNo);
13443 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13444 }
13445
13446 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13447 // better than moving to/from gpr registers for larger vectors.
13448 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13449 // Construct a mask for the tbl. We may need to adjust the index for types
13450 // larger than i8.
13451     SmallVector<unsigned, 16> Mask;
13452     unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13453 for (unsigned I = 0; I < NumElts; ++I) {
13454 SDValue V = Op.getOperand(I);
13455 if (V.isUndef()) {
13456 for (unsigned OF = 0; OF < OutputFactor; OF++)
13457 Mask.push_back(-1);
13458 continue;
13459 }
13460 // Set the Mask lanes adjusted for the size of the input and output
13461 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13462 // output element, adjusted in their positions per input and output types.
13463 unsigned Lane = V.getConstantOperandVal(1);
13464 for (unsigned S = 0; S < Sources.size(); S++) {
13465 if (V.getOperand(0) == Sources[S].Vec) {
13466 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13467 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13468 for (unsigned OF = 0; OF < OutputFactor; OF++)
13469 Mask.push_back(InputBase + OF);
13470 break;
13471 }
13472 }
13473 }
13474
13475 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13476 // v16i8, and the TBLMask
13477 SmallVector<SDValue, 16> TBLOperands;
13478 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13479 ? Intrinsic::aarch64_neon_tbl3
13480 : Intrinsic::aarch64_neon_tbl4,
13481 DL, MVT::i32));
13482 for (unsigned i = 0; i < Sources.size(); i++) {
13483 SDValue Src = Sources[i].Vec;
13484 EVT SrcVT = Src.getValueType();
13485 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13486 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13487 "Expected a legally typed vector");
13488 if (SrcVT.is64BitVector())
13489 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13490 DAG.getUNDEF(MVT::v8i8));
13491 TBLOperands.push_back(Src);
13492 }
13493
13494     SmallVector<SDValue, 16> TBLMask;
13495     for (unsigned i = 0; i < Mask.size(); i++)
13496 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13497 assert((Mask.size() == 8 || Mask.size() == 16) &&
13498 "Expected a v8i8 or v16i8 Mask");
13499 TBLOperands.push_back(DAG.getBuildVector(
13500 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13501
13502 SDValue Shuffle =
13503         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13504                     Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13505 return DAG.getBitcast(VT, Shuffle);
13506 }
13507
13508 if (Sources.size() > 2) {
13509 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13510 << "sensible when at most two source vectors are "
13511 << "involved\n");
13512 return SDValue();
13513 }
13514
13515 // Find out the smallest element size among result and two sources, and use
13516 // it as element size to build the shuffle_vector.
13517 EVT SmallestEltTy = VT.getVectorElementType();
13518 for (auto &Source : Sources) {
13519 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13520 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13521 SmallestEltTy = SrcEltTy;
13522 }
13523 }
13524 unsigned ResMultiplier =
13525 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13526 uint64_t VTSize = VT.getFixedSizeInBits();
13527 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13528 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13529
13530 // If the source vector is too wide or too narrow, we may nevertheless be able
13531 // to construct a compatible shuffle either by concatenating it with UNDEF or
13532 // extracting a suitable range of elements.
13533 for (auto &Src : Sources) {
13534 EVT SrcVT = Src.ShuffleVec.getValueType();
13535
13536 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13537 if (SrcVTSize == TypeSize::getFixed(VTSize))
13538 continue;
13539
13540 // This stage of the search produces a source with the same element type as
13541 // the original, but with a total width matching the BUILD_VECTOR output.
13542 EVT EltVT = SrcVT.getVectorElementType();
13543 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13544 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13545
13546 if (SrcVTSize.getFixedValue() < VTSize) {
13547 assert(2 * SrcVTSize == VTSize);
13548       // We can pad out the smaller vector for free, so just concatenate it
13549       // with UNDEF to reach the required width.
13550 Src.ShuffleVec =
13551 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13552 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13553 continue;
13554 }
13555
13556 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13557 LLVM_DEBUG(
13558 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13559 return SDValue();
13560 }
13561
13562 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13563 LLVM_DEBUG(
13564 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13565 return SDValue();
13566 }
13567
13568 if (Src.MinElt >= NumSrcElts) {
13569 // The extraction can just take the second half
13570 Src.ShuffleVec =
13571 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13572 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13573 Src.WindowBase = -NumSrcElts;
13574 } else if (Src.MaxElt < NumSrcElts) {
13575 // The extraction can just take the first half
13576 Src.ShuffleVec =
13577 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13578 DAG.getConstant(0, DL, MVT::i64));
13579 } else {
13580 // An actual VEXT is needed
13581 SDValue VEXTSrc1 =
13582 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13583 DAG.getConstant(0, DL, MVT::i64));
13584 SDValue VEXTSrc2 =
13585 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13586 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13587 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13588
13589 if (!SrcVT.is64BitVector()) {
13590 LLVM_DEBUG(
13591 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13592 "for SVE vectors.");
13593 return SDValue();
13594 }
13595
13596 Src.ShuffleVec =
13597 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13598 DAG.getConstant(Imm, DL, MVT::i32));
13599 Src.WindowBase = -Src.MinElt;
13600 }
13601 }
13602
13603 // Another possible incompatibility occurs from the vector element types. We
13604 // can fix this by bitcasting the source vectors to the same type we intend
13605 // for the shuffle.
13606 for (auto &Src : Sources) {
13607 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13608 if (SrcEltTy == SmallestEltTy)
13609 continue;
13610 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13611 if (DAG.getDataLayout().isBigEndian()) {
13612 Src.ShuffleVec =
13613 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13614 } else {
13615 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13616 }
13617 Src.WindowScale =
13618 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13619 Src.WindowBase *= Src.WindowScale;
13620 }
13621
13622 // Final check before we try to actually produce a shuffle.
13623 LLVM_DEBUG({
13624 for (auto Src : Sources)
13625 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13626 });
13627
13628 // The stars all align, our next step is to produce the mask for the shuffle.
13629 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13630 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13631 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13632 SDValue Entry = Op.getOperand(i);
13633 if (Entry.isUndef())
13634 continue;
13635
13636 auto Src = find(Sources, Entry.getOperand(0));
13637 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13638
13639 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13640 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13641 // segment.
13642 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13643 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13644 VT.getScalarSizeInBits());
13645 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13646
13647 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13648 // starting at the appropriate offset.
13649 int *LaneMask = &Mask[i * ResMultiplier];
13650
13651 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13652 ExtractBase += NumElts * (Src - Sources.begin());
13653 for (int j = 0; j < LanesDefined; ++j)
13654 LaneMask[j] = ExtractBase + j;
13655 }
13656
13657 // Final check before we try to produce nonsense...
13658 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13659 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13660 return SDValue();
13661 }
13662
13663 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13664 for (unsigned i = 0; i < Sources.size(); ++i)
13665 ShuffleOps[i] = Sources[i].ShuffleVec;
13666
13667 SDValue Shuffle =
13668 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13669 SDValue V;
13670 if (DAG.getDataLayout().isBigEndian()) {
13671 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13672 } else {
13673 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13674 }
13675
13676 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13677 dbgs() << "Reshuffle, creating node: "; V.dump(););
13678
13679 return V;
13680}
13681
13682// check if an EXT instruction can handle the shuffle mask when the
13683// vector sources of the shuffle are the same.
13684static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13685 unsigned NumElts = VT.getVectorNumElements();
13686
13687 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13688 if (M[0] < 0)
13689 return false;
13690
13691 Imm = M[0];
13692
13693 // If this is a VEXT shuffle, the immediate value is the index of the first
13694 // element. The other shuffle indices must be the successive elements after
13695 // the first one.
13696 unsigned ExpectedElt = Imm;
13697 for (unsigned i = 1; i < NumElts; ++i) {
13698 // Increment the expected index. If it wraps around, just follow it
13699 // back to index zero and keep going.
13700 ++ExpectedElt;
13701 if (ExpectedElt == NumElts)
13702 ExpectedElt = 0;
13703
13704 if (M[i] < 0)
13705 continue; // ignore UNDEF indices
13706 if (ExpectedElt != static_cast<unsigned>(M[i]))
13707 return false;
13708 }
13709
13710 return true;
13711}
13712
13713// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13714// v4i32s. This is really a truncate, which we can construct out of (legal)
13715// concats and truncate nodes.
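// For example (illustrative), with v4i32 inputs A, B, C and D this produces
//   trunc A..D to v4i16, concat into two v8i16, trunc those to v8i8, and
//   concat into the final v16i8,
// which subsequently becomes the uzp1 chain described in the body below.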
13716 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
13717   if (V.getValueType() != MVT::v16i8)
13718 return SDValue();
13719 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13720
13721 for (unsigned X = 0; X < 4; X++) {
13722 // Check the first item in each group is an extract from lane 0 of a v4i32
13723 // or v4i16.
13724 SDValue BaseExt = V.getOperand(X * 4);
13725 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13726 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13727 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13728 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13729 BaseExt.getConstantOperandVal(1) != 0)
13730 return SDValue();
13731 SDValue Base = BaseExt.getOperand(0);
13732 // And check the other items are extracts from the same vector.
13733 for (unsigned Y = 1; Y < 4; Y++) {
13734 SDValue Ext = V.getOperand(X * 4 + Y);
13735 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13736 Ext.getOperand(0) != Base ||
13737 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13738 Ext.getConstantOperandVal(1) != Y)
13739 return SDValue();
13740 }
13741 }
13742
13743 // Turn the buildvector into a series of truncates and concats, which will
13744 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13745 // concatenated together to produce 2 v8i16. These are both truncated and
13746 // concatenated together.
13747 SDLoc DL(V);
13748 SDValue Trunc[4] = {
13749 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13750 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13751 for (SDValue &V : Trunc)
13752 if (V.getValueType() == MVT::v4i32)
13753 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13754 SDValue Concat0 =
13755 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13756 SDValue Concat1 =
13757 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13758 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13759 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13760 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13761}
13762
13763/// Check if a vector shuffle corresponds to a DUP instruction with a larger
13764/// element width than the vector lane type. If that is the case, the function
13765/// returns true and writes the value of the DUP instruction lane operand into
13766/// DupLaneOp.
13767static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13768 unsigned &DupLaneOp) {
13769 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13770 "Only possible block sizes for wide DUP are: 16, 32, 64");
13771
13772 if (BlockSize <= VT.getScalarSizeInBits())
13773 return false;
13774 if (BlockSize % VT.getScalarSizeInBits() != 0)
13775 return false;
13776 if (VT.getSizeInBits() % BlockSize != 0)
13777 return false;
13778
13779 size_t SingleVecNumElements = VT.getVectorNumElements();
13780 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13781 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13782
13783 // We are looking for masks like
13784 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13785 // might be replaced by 'undefined'. BlockIndices will eventually contain
13786 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13787 // for the above examples)
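  // Worked example (illustrative): for a v8i16 shuffle mask
  // [2, 3, 2, 3, 2, 3, 2, 3] with BlockSize == 32, NumEltsPerBlock == 2 and
  // the duplicated block is [2, 3], so DupLaneOp == 1, i.e. a 32-bit DUP from
  // lane 1.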
13788 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13789 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13790 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13791 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13792 if (Elt < 0)
13793 continue;
13794 // For now we don't support shuffles that use the second operand
13795 if ((unsigned)Elt >= SingleVecNumElements)
13796 return false;
13797 if (BlockElts[I] < 0)
13798 BlockElts[I] = Elt;
13799 else if (BlockElts[I] != Elt)
13800 return false;
13801 }
13802
13803 // We found a candidate block (possibly with some undefs). It must be a
13804 // sequence of consecutive integers starting with a value divisible by
13805 // NumEltsPerBlock, with some values possibly replaced by undefs.
13806
13807 // Find first non-undef element
13808 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13809 assert(FirstRealEltIter != BlockElts.end() &&
13810 "Shuffle with all-undefs must have been caught by previous cases, "
13811 "e.g. isSplat()");
13812 if (FirstRealEltIter == BlockElts.end()) {
13813 DupLaneOp = 0;
13814 return true;
13815 }
13816
13817 // Index of FirstRealElt in BlockElts
13818 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13819
13820 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13821 return false;
13822 // BlockElts[0] must have the following value if it isn't undef:
13823 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13824
13825 // Check the first element
13826 if (Elt0 % NumEltsPerBlock != 0)
13827 return false;
13828 // Check that the sequence indeed consists of consecutive integers (modulo
13829 // undefs)
13830 for (size_t I = 0; I < NumEltsPerBlock; I++)
13831 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13832 return false;
13833
13834 DupLaneOp = Elt0 / NumEltsPerBlock;
13835 return true;
13836}
13837
13838// check if an EXT instruction can handle the shuffle mask when the
13839// vector sources of the shuffle are different.
13840static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13841 unsigned &Imm) {
13842 // Look for the first non-undef element.
13843 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13844
13845 // Use APInt to handle overflow when calculating the expected element.
13846 unsigned NumElts = VT.getVectorNumElements();
13847 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13848 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13849 /*implicitTrunc=*/true);
13850 // The following shuffle indices must be the successive elements after the
13851 // first real element.
13852 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13853 return Elt != ExpectedElt++ && Elt >= 0;
13854 });
13855 if (FoundWrongElt)
13856 return false;
13857
13858 // The index of an EXT is the first element if it is not UNDEF.
13859 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13860 // value of the first element. E.g.
13861 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13862 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13863 // ExpectedElt is the last mask index plus 1.
13864 Imm = ExpectedElt.getZExtValue();
13865
13866 // There are two different cases that require reversing the input vectors.
13867 // For example, for vector <4 x i32> we have the following cases,
13868 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13869 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13870 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13871 // reversing the two input vectors.
13872 if (Imm < NumElts)
13873 ReverseEXT = true;
13874 else
13875 Imm -= NumElts;
13876
13877 return true;
13878}
13879
13880/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13881/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13882/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13883static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13884 unsigned NumElts = VT.getVectorNumElements();
13885 if (NumElts % 2 != 0)
13886 return false;
13887 WhichResult = (M[0] == 0 ? 0 : 1);
13888 unsigned Idx = WhichResult * NumElts / 2;
13889 for (unsigned i = 0; i != NumElts; i += 2) {
13890 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13891 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13892 return false;
13893 Idx += 1;
13894 }
13895
13896 return true;
13897}
13898
13899/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13900/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13901/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13902static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13903 unsigned Half = VT.getVectorNumElements() / 2;
13904 WhichResult = (M[0] == 0 ? 0 : 1);
13905 for (unsigned j = 0; j != 2; ++j) {
13906 unsigned Idx = WhichResult;
13907 for (unsigned i = 0; i != Half; ++i) {
13908 int MIdx = M[i + j * Half];
13909 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13910 return false;
13911 Idx += 2;
13912 }
13913 }
13914
13915 return true;
13916}
13917
13918/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13919/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13920/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13921static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13922 unsigned NumElts = VT.getVectorNumElements();
13923 if (NumElts % 2 != 0)
13924 return false;
13925 WhichResult = (M[0] == 0 ? 0 : 1);
13926 for (unsigned i = 0; i < NumElts; i += 2) {
13927 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13928 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13929 return false;
13930 }
13931 return true;
13932}
13933
13934static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13935 bool &DstIsLeft, int &Anomaly) {
13936 if (M.size() != static_cast<size_t>(NumInputElements))
13937 return false;
13938
13939 int NumLHSMatch = 0, NumRHSMatch = 0;
13940 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13941
13942 for (int i = 0; i < NumInputElements; ++i) {
13943 if (M[i] == -1) {
13944 ++NumLHSMatch;
13945 ++NumRHSMatch;
13946 continue;
13947 }
13948
13949 if (M[i] == i)
13950 ++NumLHSMatch;
13951 else
13952 LastLHSMismatch = i;
13953
13954 if (M[i] == i + NumInputElements)
13955 ++NumRHSMatch;
13956 else
13957 LastRHSMismatch = i;
13958 }
13959
13960 if (NumLHSMatch == NumInputElements - 1) {
13961 DstIsLeft = true;
13962 Anomaly = LastLHSMismatch;
13963 return true;
13964 } else if (NumRHSMatch == NumInputElements - 1) {
13965 DstIsLeft = false;
13966 Anomaly = LastRHSMismatch;
13967 return true;
13968 }
13969
13970 return false;
13971}
13972
13973static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13974 if (VT.getSizeInBits() != 128)
13975 return false;
13976
13977 unsigned NumElts = VT.getVectorNumElements();
13978
13979 for (int I = 0, E = NumElts / 2; I != E; I++) {
13980 if (Mask[I] != I)
13981 return false;
13982 }
13983
13984 int Offset = NumElts / 2;
13985 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13986 if (Mask[I] != I + SplitLHS * Offset)
13987 return false;
13988 }
13989
13990 return true;
13991}
13992
13993 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13994   SDLoc DL(Op);
13995 EVT VT = Op.getValueType();
13996 SDValue V0 = Op.getOperand(0);
13997 SDValue V1 = Op.getOperand(1);
13998 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13999
14000   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14001       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14002     return SDValue();
14003
14004 bool SplitV0 = V0.getValueSizeInBits() == 128;
14005
14006 if (!isConcatMask(Mask, VT, SplitV0))
14007 return SDValue();
14008
14009 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14010 if (SplitV0) {
14011 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14012 DAG.getConstant(0, DL, MVT::i64));
14013 }
14014 if (V1.getValueSizeInBits() == 128) {
14015 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14016 DAG.getConstant(0, DL, MVT::i64));
14017 }
14018 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14019}
14020
14021/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14022/// the specified operations to build the shuffle. ID is the perfect-shuffle
14023/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
14024/// table entry and LHS/RHS are the immediate inputs for this stage of the
14025/// shuffle.
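///
/// Each PFEntry packs an opcode in bits [29:26] and two 13-bit operand IDs in
/// bits [25:13] and [12:0]; an ID encodes four lane indices in base 9, with
/// the digit 8 standing for an undef lane (see getPFIDLane below).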
14026 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14027                                       unsigned PFEntry, SDValue LHS,
14028 SDValue RHS, SelectionDAG &DAG,
14029 const SDLoc &DL) {
14030 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14031 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14032 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14033
14034 enum {
14035 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14036 OP_VREV,
14037 OP_VDUP0,
14038 OP_VDUP1,
14039 OP_VDUP2,
14040 OP_VDUP3,
14041 OP_VEXT1,
14042 OP_VEXT2,
14043 OP_VEXT3,
14044 OP_VUZPL, // VUZP, left result
14045 OP_VUZPR, // VUZP, right result
14046 OP_VZIPL, // VZIP, left result
14047 OP_VZIPR, // VZIP, right result
14048 OP_VTRNL, // VTRN, left result
14049 OP_VTRNR, // VTRN, right result
14050 OP_MOVLANE // Move lane. RHSID is the lane to move into
14051 };
14052
14053 if (OpNum == OP_COPY) {
14054 if (LHSID == (1 * 9 + 2) * 9 + 3)
14055 return LHS;
14056 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14057 return RHS;
14058 }
14059
14060 if (OpNum == OP_MOVLANE) {
14061 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14062 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14063 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14064 Elt = 3 - Elt;
14065 while (Elt > 0) {
14066 ID /= 9;
14067 Elt--;
14068 }
14069 return (ID % 9 == 8) ? -1 : ID % 9;
14070 };
14071
14072 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14073 // get the lane to move from the PFID, which is always from the
14074 // original vectors (V1 or V2).
14075     SDValue OpLHS = GeneratePerfectShuffle(
14076         LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14077 EVT VT = OpLHS.getValueType();
14078 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14079 unsigned ExtLane = 0;
14080 SDValue Input;
14081
14082     // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
14083     // convert into a higher type.
14084 if (RHSID & 0x4) {
14085 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14086 if (MaskElt == -1)
14087 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14088 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14089 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14090 Input = MaskElt < 2 ? V1 : V2;
14091 if (VT.getScalarSizeInBits() == 16) {
14092 Input = DAG.getBitcast(MVT::v2f32, Input);
14093 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14094 } else {
14095 assert(VT.getScalarSizeInBits() == 32 &&
14096 "Expected 16 or 32 bit shuffle elements");
14097 Input = DAG.getBitcast(MVT::v2f64, Input);
14098 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14099 }
14100 } else {
14101 int MaskElt = getPFIDLane(ID, RHSID);
14102 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14103 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14104 Input = MaskElt < 4 ? V1 : V2;
14105 // Be careful about creating illegal types. Use f16 instead of i16.
14106 if (VT == MVT::v4i16) {
14107 Input = DAG.getBitcast(MVT::v4f16, Input);
14108 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14109 }
14110 }
14111     SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14112                               Input.getValueType().getVectorElementType(),
14113 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14114 SDValue Ins =
14115 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14116 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14117 return DAG.getBitcast(VT, Ins);
14118 }
14119
14120 SDValue OpLHS, OpRHS;
14121 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14122 RHS, DAG, DL);
14123 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14124 RHS, DAG, DL);
14125 EVT VT = OpLHS.getValueType();
14126
14127 switch (OpNum) {
14128 default:
14129 llvm_unreachable("Unknown shuffle opcode!");
14130 case OP_VREV:
14131 // VREV divides the vector in half and swaps within the half.
14132 if (VT.getVectorElementType() == MVT::i32 ||
14133 VT.getVectorElementType() == MVT::f32)
14134 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14135 // vrev <4 x i16> -> REV32
14136 if (VT.getVectorElementType() == MVT::i16 ||
14137 VT.getVectorElementType() == MVT::f16 ||
14138 VT.getVectorElementType() == MVT::bf16)
14139 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14140 // vrev <4 x i8> -> REV16
14141 assert(VT.getVectorElementType() == MVT::i8);
14142 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14143 case OP_VDUP0:
14144 case OP_VDUP1:
14145 case OP_VDUP2:
14146 case OP_VDUP3: {
14147 EVT EltTy = VT.getVectorElementType();
14148 unsigned Opcode;
14149 if (EltTy == MVT::i8)
14150 Opcode = AArch64ISD::DUPLANE8;
14151 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14152 Opcode = AArch64ISD::DUPLANE16;
14153 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14154 Opcode = AArch64ISD::DUPLANE32;
14155 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14156 Opcode = AArch64ISD::DUPLANE64;
14157 else
14158 llvm_unreachable("Invalid vector element type?");
14159
14160 if (VT.getSizeInBits() == 64)
14161 OpLHS = WidenVector(OpLHS, DAG);
14162 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14163 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14164 }
14165 case OP_VEXT1:
14166 case OP_VEXT2:
14167 case OP_VEXT3: {
14168 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14169 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14170 DAG.getConstant(Imm, DL, MVT::i32));
14171 }
14172 case OP_VUZPL:
14173 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14174 case OP_VUZPR:
14175 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14176 case OP_VZIPL:
14177 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14178 case OP_VZIPR:
14179 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14180 case OP_VTRNL:
14181 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14182 case OP_VTRNR:
14183 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14184 }
14185}
14186
14187 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14188 SelectionDAG &DAG) {
14189 // Check to see if we can use the TBL instruction.
14190 SDValue V1 = Op.getOperand(0);
14191 SDValue V2 = Op.getOperand(1);
14192 SDLoc DL(Op);
14193
14194 EVT EltVT = Op.getValueType().getVectorElementType();
14195 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14196
14197 bool Swap = false;
14198 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14199 std::swap(V1, V2);
14200 Swap = true;
14201 }
14202
14203 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14204 // out of range values with 0s. We do need to make sure that any out-of-range
14205 // values are really out-of-range for a v16i8 vector.
14206 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14207 MVT IndexVT = MVT::v8i8;
14208 unsigned IndexLen = 8;
14209 if (Op.getValueSizeInBits() == 128) {
14210 IndexVT = MVT::v16i8;
14211 IndexLen = 16;
14212 }
14213
14214 SmallVector<SDValue, 8> TBLMask;
14215 for (int Val : ShuffleMask) {
14216 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14217 unsigned Offset = Byte + Val * BytesPerElt;
14218 if (Swap)
14219 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14220 if (IsUndefOrZero && Offset >= IndexLen)
14221 Offset = 255;
14222 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14223 }
14224 }
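// E.g. for a 64-bit shuffle with 16-bit elements, mask value 5 selects
// element 1 of V2, which lives at bytes 10 and 11 of the concatenated
// { V1, V2 } byte table.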
14225
14226 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14227 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14228
14229 SDValue Shuffle;
14230 if (IsUndefOrZero) {
14231 if (IndexLen == 8)
14232 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14233 Shuffle = DAG.getNode(
14234 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14235 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14236 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14237 } else {
14238 if (IndexLen == 8) {
14239 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14240 Shuffle = DAG.getNode(
14241 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14242 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14243 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14244 } else {
14245 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14246 // cannot currently represent the register constraints on the input
14247 // table registers.
14248 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14249 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14250 // IndexLen));
14251 Shuffle = DAG.getNode(
14252 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14253 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
14254 V2Cst,
14255 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14256 }
14257 }
14258 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14259}
14260
14261static unsigned getDUPLANEOp(EVT EltType) {
14262 if (EltType == MVT::i8)
14263 return AArch64ISD::DUPLANE8;
14264 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14265 return AArch64ISD::DUPLANE16;
14266 if (EltType == MVT::i32 || EltType == MVT::f32)
14267 return AArch64ISD::DUPLANE32;
14268 if (EltType == MVT::i64 || EltType == MVT::f64)
14269 return AArch64ISD::DUPLANE64;
14270
14271 llvm_unreachable("Invalid vector element type?");
14272}
14273
14274static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14275 unsigned Opcode, SelectionDAG &DAG) {
14276 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14277 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14278 // Match: dup (bitcast (extract_subv X, C)), LaneC
14279 if (BitCast.getOpcode() != ISD::BITCAST ||
14280 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14281 return false;
14282
14283 // The extract index must align in the destination type. That may not
14284 // happen if the bitcast is from a narrow type to a wide type.
14285 SDValue Extract = BitCast.getOperand(0);
14286 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14287 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14288 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14289 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14290 if (ExtIdxInBits % CastedEltBitWidth != 0)
14291 return false;
14292
14293 // Can't handle cases where vector size is not 128-bit
14294 if (!Extract.getOperand(0).getValueType().is128BitVector())
14295 return false;
14296
14297 // Update the lane value by offsetting with the scaled extract index.
14298 LaneC += ExtIdxInBits / CastedEltBitWidth;
14299
14300 // Determine the casted vector type of the wide vector input.
14301 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14302 // Examples:
14303 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14304 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14305 unsigned SrcVecNumElts =
14306 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14307 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14308 SrcVecNumElts);
14309 return true;
14310 };
14311 MVT CastVT;
14312 if (getScaledOffsetDup(V, Lane, CastVT)) {
14313 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14314 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14315 V.getOperand(0).getValueType().is128BitVector()) {
14316 // The lane is incremented by the index of the extract.
14317 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14318 Lane += V.getConstantOperandVal(1);
14319 V = V.getOperand(0);
14320 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14321 // The lane is decremented if we are splatting from the 2nd operand.
14322 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14323 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14324 Lane -= Idx * VT.getVectorNumElements() / 2;
14325 V = WidenVector(V.getOperand(Idx), DAG);
14326 } else if (VT.getSizeInBits() == 64) {
14327 // Widen the operand to 128-bit register with undef.
14328 V = WidenVector(V, DAG);
14329 }
14330 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14331}
14332
14333// Try to widen element type to get a new mask value for a better permutation
14334// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14335// UZP1/2, TRN1/2, REV, INS, etc.
14336// For example:
14337// shufflevector <4 x i32> %a, <4 x i32> %b,
14338// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14339// is equivalent to:
14340// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14341// Finally, we can get:
14342// mov v0.d[0], v1.d[1]
14343 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14344 SDLoc DL(Op);
14345 EVT VT = Op.getValueType();
14346 EVT ScalarVT = VT.getVectorElementType();
14347 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14348 SDValue V0 = Op.getOperand(0);
14349 SDValue V1 = Op.getOperand(1);
14350 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14351
14352 // When combining adjacent elements, e.g. two i16's -> i32 or two i32's -> i64,
14353 // we need to make sure the wider element type is legal. Thus, ElementSize
14354 // should not be larger than 32 bits, and the i1 type should also be excluded.
14355 if (ElementSize > 32 || ElementSize == 1)
14356 return SDValue();
14357
14358 SmallVector<int, 8> NewMask;
14359 if (widenShuffleMaskElts(Mask, NewMask)) {
14360 MVT NewEltVT = VT.isFloatingPoint()
14361 ? MVT::getFloatingPointVT(ElementSize * 2)
14362 : MVT::getIntegerVT(ElementSize * 2);
14363 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14364 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14365 V0 = DAG.getBitcast(NewVT, V0);
14366 V1 = DAG.getBitcast(NewVT, V1);
14367 return DAG.getBitcast(VT,
14368 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14369 }
14370 }
14371
14372 return SDValue();
14373}
14374
14375// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14376 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14377 ArrayRef<int> ShuffleMask,
14378 SelectionDAG &DAG) {
14379 SDValue Tbl1 = Op->getOperand(0);
14380 SDValue Tbl2 = Op->getOperand(1);
14381 SDLoc DL(Op);
14382 SDValue Tbl2ID =
14383 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14384
14385 EVT VT = Op.getValueType();
14386 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14387 Tbl1.getOperand(0) != Tbl2ID ||
14388 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14389 Tbl2.getOperand(0) != Tbl2ID)
14390 return SDValue();
14391
14392 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14393 return SDValue();
14394
14395 SDValue Mask1 = Tbl1.getOperand(3);
14396 SDValue Mask2 = Tbl2.getOperand(3);
14397 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14398 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14399 return SDValue();
14400
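// The first tbl2's mask bytes can be reused directly; bytes chosen by the
// second tbl2 refer to its own two table registers, which become registers
// 3 and 4 of the tbl4, so those indices are shifted up by 32 bytes.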
14401 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14402 for (unsigned I = 0; I < 16; I++) {
14403 if (ShuffleMask[I] < 16)
14404 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14405 else {
14406 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14407 if (!C)
14408 return SDValue();
14409 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14410 }
14411 }
14412
14413 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14414 SDValue ID =
14415 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14416
14417 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14418 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14419 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14420}
14421
14422// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14423// but we don't have an appropriate instruction,
14424// so custom-lower it as ZIP1-with-zeros.
14425SDValue
14426AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14427 SelectionDAG &DAG) const {
14428 SDLoc DL(Op);
14429 EVT VT = Op.getValueType();
14430 SDValue SrcOp = Op.getOperand(0);
14431 EVT SrcVT = SrcOp.getValueType();
14432 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14433 "Unexpected extension factor.");
14434 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14435 // FIXME: support multi-step zipping?
14436 if (Scale != 2)
14437 return SDValue();
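// Interleaving the source with a zero vector puts each narrow element into
// one half of a double-width lane and zeros in the other half, which is
// exactly the zero-extended result once reinterpreted as the wider type.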
14438 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14439 return DAG.getBitcast(VT,
14440 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14441}
14442
14443SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14444 SelectionDAG &DAG) const {
14445 SDLoc DL(Op);
14446 EVT VT = Op.getValueType();
14447
14448 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14449
14450 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14451 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14452
14453 // Convert shuffles that are directly supported on NEON to target-specific
14454 // DAG nodes, instead of keeping them as shuffles and matching them again
14455 // during code selection. This is more efficient and avoids the possibility
14456 // of inconsistencies between legalization and selection.
14457 ArrayRef<int> ShuffleMask = SVN->getMask();
14458
14459 SDValue V1 = Op.getOperand(0);
14460 SDValue V2 = Op.getOperand(1);
14461
14462 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14463 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14464 "Unexpected VECTOR_SHUFFLE mask size!");
14465
14466 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14467 return Res;
14468
14469 if (SVN->isSplat()) {
14470 int Lane = SVN->getSplatIndex();
14471 // If this is an undef splat, generate it via "just" vdup, if possible.
14472 if (Lane == -1)
14473 Lane = 0;
14474
14475 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14476 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14477 V1.getOperand(0));
14478 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14479 // constant. If so, we can just reference the lane's definition directly.
14480 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14481 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14482 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14483
14484 // Otherwise, duplicate from the lane of the input vector.
14485 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14486 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14487 }
14488
14489 // Check if the mask matches a DUP for a wider element
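// E.g. a v8i16 mask <0,1,0,1,0,1,0,1> splats the first 32-bit lane and can
// be lowered as DUPLANE32 of a v4i32 bitcast of V1.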
14490 for (unsigned LaneSize : {64U, 32U, 16U}) {
14491 unsigned Lane = 0;
14492 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14493 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14494 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14495 : AArch64ISD::DUPLANE16;
14496 // Cast V1 to an integer vector with required lane size
14497 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14498 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14499 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14500 V1 = DAG.getBitcast(NewVecTy, V1);
14501 // Construct the DUP instruction
14502 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14503 // Cast back to the original type
14504 return DAG.getBitcast(VT, V1);
14505 }
14506 }
14507
14508 unsigned NumElts = VT.getVectorNumElements();
14509 unsigned EltSize = VT.getScalarSizeInBits();
14510 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14511 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14512 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14513 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14514 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14515 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14516
14517 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14518 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14519 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14520 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14521 DAG.getConstant(8, DL, MVT::i32));
14522 }
14523
14524 bool ReverseEXT = false;
14525 unsigned Imm;
14526 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14527 if (ReverseEXT)
14528 std::swap(V1, V2);
14529 Imm *= getExtFactor(V1);
14530 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14531 DAG.getConstant(Imm, DL, MVT::i32));
14532 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14533 Imm *= getExtFactor(V1);
14534 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14535 DAG.getConstant(Imm, DL, MVT::i32));
14536 }
14537
14538 unsigned WhichResult;
14539 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14540 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14541 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14542 }
14543 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14544 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14545 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14546 }
14547 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14548 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14549 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14550 }
14551
14552 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14553 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14554 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14555 }
14556 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14557 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14558 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14559 }
14560 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14561 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14562 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14563 }
14564
14565 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14566 return Concat;
14567
14568 bool DstIsLeft;
14569 int Anomaly;
14570 int NumInputElements = V1.getValueType().getVectorNumElements();
14571 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14572 SDValue DstVec = DstIsLeft ? V1 : V2;
14573 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14574
14575 SDValue SrcVec = V1;
14576 int SrcLane = ShuffleMask[Anomaly];
14577 if (SrcLane >= NumInputElements) {
14578 SrcVec = V2;
14579 SrcLane -= NumElts;
14580 }
14581 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14582
14583 EVT ScalarVT = VT.getVectorElementType();
14584
14585 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14586 ScalarVT = MVT::i32;
14587
14588 return DAG.getNode(
14589 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14590 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14591 DstLaneV);
14592 }
14593
14594 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14595 return NewSD;
14596
14597 // If the shuffle is not directly supported and it has 4 elements, use
14598 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14599 if (NumElts == 4) {
14600 unsigned PFIndexes[4];
14601 for (unsigned i = 0; i != 4; ++i) {
14602 if (ShuffleMask[i] < 0)
14603 PFIndexes[i] = 8;
14604 else
14605 PFIndexes[i] = ShuffleMask[i];
14606 }
14607
14608 // Compute the index in the perfect shuffle table.
14609 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14610 PFIndexes[2] * 9 + PFIndexes[3];
14611 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14612 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14613 DL);
14614 }
14615
14616 // Check for a "select shuffle", generating a BSL to pick between lanes in
14617 // V1/V2.
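// E.g. for v4i32, mask <0,5,2,7> keeps lanes 0 and 2 of V1 and takes lanes
// 1 and 3 from V2; the BSP mask constant is then <-1,0,-1,0>.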
14618 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14619 assert(VT.getScalarSizeInBits() <= 32 &&
14620 "Expected larger vector element sizes to be handled already");
14621 SmallVector<SDValue> MaskElts;
14622 for (int M : ShuffleMask)
14623 MaskElts.push_back(DAG.getConstant(
14624 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14625 EVT IVT = VT.changeVectorElementTypeToInteger();
14626 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14627 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14628 DAG.getBitcast(IVT, V1),
14629 DAG.getBitcast(IVT, V2)));
14630 }
14631
14632 // Fall back to generating a TBL
14633 return GenerateTBL(Op, ShuffleMask, DAG);
14634}
14635
14636SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14637 SelectionDAG &DAG) const {
14638 EVT VT = Op.getValueType();
14639
14640 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14641 return LowerToScalableOp(Op, DAG);
14642
14643 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14644 "Unexpected vector type!");
14645
14646 // We can handle the constant cases during isel.
14647 if (isa<ConstantSDNode>(Op.getOperand(0)))
14648 return Op;
14649
14650 // There isn't a natural way to handle the general i1 case, so we use some
14651 // trickery with whilelo.
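// After sign-extending the i1 splat value to 0 or -1, whilelo(0, splat)
// yields an all-false predicate for 0 and an all-true predicate for -1,
// since every lane index is unsigned-less-than UINT64_MAX.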
14652 SDLoc DL(Op);
14653 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14654 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14655 DAG.getValueType(MVT::i1));
14656 SDValue ID =
14657 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14658 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14659 if (VT == MVT::nxv1i1)
14660 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14661 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14662 Zero, SplatVal),
14663 Zero);
14664 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14665}
14666
14667SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14668 SelectionDAG &DAG) const {
14669 SDLoc DL(Op);
14670
14671 EVT VT = Op.getValueType();
14672 if (!isTypeLegal(VT) || !VT.isScalableVector())
14673 return SDValue();
14674
14675 // Current lowering only supports the SVE-ACLE types.
14676 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14677 return SDValue();
14678
14679 // The DUPQ operation is independent of element type so normalise to i64s.
14680 SDValue Idx128 = Op.getOperand(2);
14681
14682 // DUPQ can be used when idx is in range.
14683 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14684 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14685 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14686 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14687 }
14688
14689 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14690
14691 // The ACLE says this must produce the same result as:
14692 // svtbl(data, svadd_x(svptrue_b64(),
14693 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14694 // index * 2))
14695 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14696 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14697
14698 // create the vector 0,1,0,1,...
14699 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14700 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14701
14702 // create the vector idx64,idx64+1,idx64,idx64+1,...
14703 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14704 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14705 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14706
14707 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14708 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14709 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14710}
14711
14712
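// Gather the constant and undef bits of a constant-splat BUILD_VECTOR into
// full-width APInts by replicating the splat value across the vector size.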
14713static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14714 APInt &UndefBits) {
14715 EVT VT = BVN->getValueType(0);
14716 APInt SplatBits, SplatUndef;
14717 unsigned SplatBitSize;
14718 bool HasAnyUndefs;
14719 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14720 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14721
14722 for (unsigned i = 0; i < NumSplats; ++i) {
14723 CnstBits <<= SplatBitSize;
14724 UndefBits <<= SplatBitSize;
14725 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14726 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14727 }
14728
14729 return true;
14730 }
14731
14732 return false;
14733}
14734
14735// Try 64-bit splatted SIMD immediate.
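// E.g. a splat of 0x00ff00ff00ff00ff is representable: the 64-bit MOVI form
// encodes one bit per byte, each byte being either 0x00 or 0xff.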
14736static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14737 const APInt &Bits) {
14738 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14739 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14740 EVT VT = Op.getValueType();
14741 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14742
14743 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14744 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14745
14746 SDLoc DL(Op);
14747 SDValue Mov =
14748 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14749 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14750 }
14751 }
14752
14753 return SDValue();
14754}
14755
14756// Try 32-bit splatted SIMD immediate.
14757static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14758 const APInt &Bits,
14759 const SDValue *LHS = nullptr) {
14760 EVT VT = Op.getValueType();
14761 if (VT.isFixedLengthVector() &&
14763 return SDValue();
14764
14765 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14766 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14767 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14768 bool isAdvSIMDModImm = false;
14769 uint64_t Shift;
14770
14771 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14772 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14773 Shift = 0;
14774 }
14775 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14776 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14777 Shift = 8;
14778 }
14779 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14780 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14781 Shift = 16;
14782 }
14783 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14784 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14785 Shift = 24;
14786 }
14787
14788 if (isAdvSIMDModImm) {
14789 SDLoc DL(Op);
14790 SDValue Mov;
14791
14792 if (LHS)
14793 Mov = DAG.getNode(NewOp, DL, MovTy,
14794 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14795 DAG.getConstant(Value, DL, MVT::i32),
14796 DAG.getConstant(Shift, DL, MVT::i32));
14797 else
14798 Mov =
14799 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14800 DAG.getConstant(Shift, DL, MVT::i32));
14801
14802 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14803 }
14804 }
14805
14806 return SDValue();
14807}
14808
14809// Try 16-bit splatted SIMD immediate.
14810static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14811 const APInt &Bits,
14812 const SDValue *LHS = nullptr) {
14813 EVT VT = Op.getValueType();
14814 if (VT.isFixedLengthVector() &&
14816 return SDValue();
14817
14818 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14819 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14820 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14821 bool isAdvSIMDModImm = false;
14822 uint64_t Shift;
14823
14824 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14825 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14826 Shift = 0;
14827 }
14828 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14829 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14830 Shift = 8;
14831 }
14832
14833 if (isAdvSIMDModImm) {
14834 SDLoc DL(Op);
14835 SDValue Mov;
14836
14837 if (LHS)
14838 Mov = DAG.getNode(NewOp, DL, MovTy,
14839 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14840 DAG.getConstant(Value, DL, MVT::i32),
14841 DAG.getConstant(Shift, DL, MVT::i32));
14842 else
14843 Mov =
14844 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14845 DAG.getConstant(Shift, DL, MVT::i32));
14846
14847 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14848 }
14849 }
14850
14851 return SDValue();
14852}
14853
14854// Try 32-bit splatted SIMD immediate with shifted ones.
14855 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14856 SelectionDAG &DAG, const APInt &Bits) {
14857 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14858 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14859 EVT VT = Op.getValueType();
14860 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14861 bool isAdvSIMDModImm = false;
14862 uint64_t Shift;
14863
14864 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14865 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14866 Shift = 264;
14867 }
14868 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14869 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14870 Shift = 272;
14871 }
14872
14873 if (isAdvSIMDModImm) {
14874 SDLoc DL(Op);
14875 SDValue Mov =
14876 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14877 DAG.getConstant(Shift, DL, MVT::i32));
14878 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14879 }
14880 }
14881
14882 return SDValue();
14883}
14884
14885// Try 8-bit splatted SIMD immediate.
14886static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14887 const APInt &Bits) {
14888 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14889 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14890 EVT VT = Op.getValueType();
14891 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14892
14893 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14894 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14895
14896 SDLoc DL(Op);
14897 SDValue Mov =
14898 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14899 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14900 }
14901 }
14902
14903 return SDValue();
14904}
14905
14906// Try FP splatted SIMD immediate.
14907static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14908 const APInt &Bits) {
14909 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14910 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14911 EVT VT = Op.getValueType();
14912 bool isWide = (VT.getSizeInBits() == 128);
14913 MVT MovTy;
14914 bool isAdvSIMDModImm = false;
14915
14916 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14917 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14918 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14919 }
14920 else if (isWide &&
14921 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14922 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14923 MovTy = MVT::v2f64;
14924 }
14925
14926 if (isAdvSIMDModImm) {
14927 SDLoc DL(Op);
14928 SDValue Mov =
14929 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14930 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14931 }
14932 }
14933
14934 return SDValue();
14935}
14936
14937// Specialized code to quickly find if PotentialBVec is a BuildVector that
14938// consists of only the same constant int value, returned in reference arg
14939// ConstVal
14940static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14941 uint64_t &ConstVal) {
14942 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14943 if (!Bvec)
14944 return false;
14945 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14946 if (!FirstElt)
14947 return false;
14948 EVT VT = Bvec->getValueType(0);
14949 unsigned NumElts = VT.getVectorNumElements();
14950 for (unsigned i = 1; i < NumElts; ++i)
14951 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14952 return false;
14953 ConstVal = FirstElt->getZExtValue();
14954 return true;
14955}
14956
14957 static bool isAllInactivePredicate(SDValue N) {
14958 // Look through cast.
14959 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14960 N = N.getOperand(0);
14961
14962 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14963}
14964
14965 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14966 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14967
14968 // Look through cast.
14969 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14970 N = N.getOperand(0);
14971 // When reinterpreting from a type with fewer elements the "new" elements
14972 // are not active, so bail if they're likely to be used.
14973 if (N.getValueType().getVectorMinNumElements() < NumElts)
14974 return false;
14975 }
14976
14977 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14978 return true;
14979
14980 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14981 // or smaller than the implicit element type represented by N.
14982 // NOTE: A larger element count implies a smaller element type.
14983 if (N.getOpcode() == AArch64ISD::PTRUE &&
14984 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14985 return N.getValueType().getVectorMinNumElements() >= NumElts;
14986
14987 // If we're compiling for a specific vector-length, we can check if the
14988 // pattern's VL equals that of the scalable vector at runtime.
14989 if (N.getOpcode() == AArch64ISD::PTRUE) {
14990 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14991 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14992 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14993 if (MaxSVESize && MinSVESize == MaxSVESize) {
14994 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14995 unsigned PatNumElts =
14996 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14997 return PatNumElts == (NumElts * VScale);
14998 }
14999 }
15000
15001 return false;
15002}
15003
15004// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15005// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15006// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15007// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15008// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15009// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
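// E.g. with 32-bit elements, (or (and X, 0x00ffffff), (shl Y, 24)) becomes
// (SLI X, Y, 24): the shifted Y bits occupy exactly the bits that the AND
// mask cleared in X.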
15010 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15011 EVT VT = N->getValueType(0);
15012
15013 if (!VT.isVector())
15014 return SDValue();
15015
15016 SDLoc DL(N);
15017
15018 SDValue And;
15019 SDValue Shift;
15020
15021 SDValue FirstOp = N->getOperand(0);
15022 unsigned FirstOpc = FirstOp.getOpcode();
15023 SDValue SecondOp = N->getOperand(1);
15024 unsigned SecondOpc = SecondOp.getOpcode();
15025
15026 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15027 // a BICi in order to use an immediate instead of a register.
15028 // Is the other operand a shl or lshr? This will have been turned into:
15029 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15030 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15031 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15032 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15033 SecondOpc == AArch64ISD::SHL_PRED ||
15034 SecondOpc == AArch64ISD::SRL_PRED)) {
15035 And = FirstOp;
15036 Shift = SecondOp;
15037
15038 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15039 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15040 FirstOpc == AArch64ISD::SHL_PRED ||
15041 FirstOpc == AArch64ISD::SRL_PRED)) {
15042 And = SecondOp;
15043 Shift = FirstOp;
15044 } else
15045 return SDValue();
15046
15047 bool IsAnd = And.getOpcode() == ISD::AND;
15048 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15049 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15050 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15051 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15052
15053 // Is the shift amount constant and are all lanes active?
15054 uint64_t C2;
15055 if (ShiftHasPredOp) {
15056 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15057 return SDValue();
15058 APInt C;
15059 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15060 return SDValue();
15061 C2 = C.getZExtValue();
15062 } else if (ConstantSDNode *C2node =
15063 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15064 C2 = C2node->getZExtValue();
15065 else
15066 return SDValue();
15067
15068 APInt C1AsAPInt;
15069 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15070 if (IsAnd) {
15071 // Is the and mask vector all constant?
15072 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15073 return SDValue();
15074 } else {
15075 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15076 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15077 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15078 assert(C1nodeImm && C1nodeShift);
15079 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15080 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15081 }
15082
15083 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15084 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15085 // how much one can shift elements of a particular size?
15086 if (C2 > ElemSizeInBits)
15087 return SDValue();
15088
15089 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15090 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15091 if (C1AsAPInt != RequiredC1)
15092 return SDValue();
15093
15094 SDValue X = And.getOperand(0);
15095 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15096 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15097 : Shift.getOperand(1);
15098
15099 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15100 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
15101
15102 return ResultSLI;
15103}
15104
15105 static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15106 EVT VT = N->getValueType(0);
15107 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15108 SDLoc DL(N);
15109 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15110
15111 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15112 return SDValue();
15113
15114 SDValue N0 = N->getOperand(0);
15115 if (N0.getOpcode() != ISD::AND)
15116 return SDValue();
15117
15118 SDValue N1 = N->getOperand(1);
15119 if (N1.getOpcode() != ISD::AND)
15120 return SDValue();
15121
15122 // InstCombine does (not (neg a)) => (add a -1).
15123 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
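// Since ~(0 - a) == a - 1 in two's complement, the SUB and ADD results are
// bitwise complements of each other and so form a valid BSL select mask.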
15124 // Loop over all combinations of AND operands.
15125 for (int i = 1; i >= 0; --i) {
15126 for (int j = 1; j >= 0; --j) {
15127 SDValue O0 = N0->getOperand(i);
15128 SDValue O1 = N1->getOperand(j);
15129 SDValue Sub, Add, SubSibling, AddSibling;
15130
15131 // Find a SUB and an ADD operand, one from each AND.
15132 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15133 Sub = O0;
15134 Add = O1;
15135 SubSibling = N0->getOperand(1 - i);
15136 AddSibling = N1->getOperand(1 - j);
15137 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15138 Add = O0;
15139 Sub = O1;
15140 AddSibling = N0->getOperand(1 - i);
15141 SubSibling = N1->getOperand(1 - j);
15142 } else
15143 continue;
15144
15145 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15146 continue;
15147
15148 // The all-ones constant is always the right-hand operand of the Add.
15149 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15150 continue;
15151
15152 if (Sub.getOperand(1) != Add.getOperand(0))
15153 continue;
15154
15155 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15156 }
15157 }
15158
15159 // (or (and a b) (and (not a) c)) => (bsl a b c)
15160 // We only have to look for constant vectors here since the general, variable
15161 // case can be handled in TableGen.
15162 unsigned Bits = VT.getScalarSizeInBits();
15163 for (int i = 1; i >= 0; --i)
15164 for (int j = 1; j >= 0; --j) {
15165 APInt Val1, Val2;
15166
15167 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15168 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15169 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15170 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15171 N0->getOperand(1 - i), N1->getOperand(1 - j));
15172 }
15173 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15174 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15175 if (!BVN0 || !BVN1)
15176 continue;
15177
15178 bool FoundMatch = true;
15179 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15180 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15181 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15182 if (!CN0 || !CN1 ||
15183 CN0->getAPIntValue().trunc(Bits) !=
15184 ~CN1->getAsAPIntVal().trunc(Bits)) {
15185 FoundMatch = false;
15186 break;
15187 }
15188 }
15189 if (FoundMatch)
15190 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15191 N0->getOperand(1 - i), N1->getOperand(1 - j));
15192 }
15193
15194 return SDValue();
15195}
15196
15197SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15198 SelectionDAG &DAG) const {
15199 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15200 !Subtarget->isNeonAvailable()))
15201 return LowerToScalableOp(Op, DAG);
15202
15203 if (SDValue Res = tryLowerToBSL(Op, DAG))
15204 return Res;
15205
15206 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15207 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15208 return Res;
15209
15210 EVT VT = Op.getValueType();
15211 if (VT.isScalableVector())
15212 return Op;
15213
15214 SDValue LHS = Op.getOperand(0);
15215 BuildVectorSDNode *BVN =
15216 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15217 if (!BVN) {
15218 // OR commutes, so try swapping the operands.
15219 LHS = Op.getOperand(1);
15220 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15221 }
15222 if (!BVN)
15223 return Op;
15224
15225 APInt DefBits(VT.getSizeInBits(), 0);
15226 APInt UndefBits(VT.getSizeInBits(), 0);
15227 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15228 SDValue NewOp;
15229
15230 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15231 DefBits, &LHS)) ||
15232 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15233 DefBits, &LHS)))
15234 return NewOp;
15235
15236 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15237 UndefBits, &LHS)) ||
15238 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15239 UndefBits, &LHS)))
15240 return NewOp;
15241 }
15242
15243 // We can always fall back to a non-immediate OR.
15244 return Op;
15245}
15246
15247// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15248// be truncated to fit element width.
15249 static SDValue NormalizeBuildVector(SDValue Op,
15250 SelectionDAG &DAG) {
15251 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15252 SDLoc DL(Op);
15253 EVT VT = Op.getValueType();
15254 EVT EltTy = VT.getVectorElementType();
15255
15256 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15257 return Op;
15258
15260 for (SDValue Lane : Op->ops()) {
15261 // For integer vectors, type legalization would have promoted the
15262 // operands already. Otherwise, if Op is a floating-point splat
15263 // (with operands cast to integers), then the only possibilities
15264 // are constants and UNDEFs.
15265 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15266 Lane = DAG.getConstant(
15267 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15268 DL, MVT::i32);
15269 } else if (Lane.getNode()->isUndef()) {
15270 Lane = DAG.getUNDEF(MVT::i32);
15271 } else {
15272 assert(Lane.getValueType() == MVT::i32 &&
15273 "Unexpected BUILD_VECTOR operand type");
15274 }
15275 Ops.push_back(Lane);
15276 }
15277 return DAG.getBuildVector(VT, DL, Ops);
15278}
15279
15280 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15281 const AArch64Subtarget *ST) {
15282 EVT VT = Op.getValueType();
15283 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15284 "Expected a legal NEON vector");
15285
15286 APInt DefBits(VT.getSizeInBits(), 0);
15287 APInt UndefBits(VT.getSizeInBits(), 0);
15288 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15289 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15290 auto TryMOVIWithBits = [&](APInt DefBits) {
15291 SDValue NewOp;
15292 if ((NewOp =
15293 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15294 (NewOp =
15295 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15296 (NewOp =
15297 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15298 (NewOp =
15299 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15300 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15301 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15302 return NewOp;
15303
15304 APInt NotDefBits = ~DefBits;
15305 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15306 NotDefBits)) ||
15307 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15308 NotDefBits)) ||
15309 (NewOp =
15310 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15311 return NewOp;
15312 return SDValue();
15313 };
15314 if (SDValue R = TryMOVIWithBits(DefBits))
15315 return R;
15316 if (SDValue R = TryMOVIWithBits(UndefBits))
15317 return R;
15318
15319 // See if a fneg of the constant can be materialized with a MOVI, etc
15320 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15321 // FNegate each sub-element of the constant
15322 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15323 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15324 .zext(VT.getSizeInBits());
15325 APInt NegBits(VT.getSizeInBits(), 0);
15326 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15327 for (unsigned i = 0; i < NumElts; i++)
15328 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15329 NegBits = DefBits ^ NegBits;
15330
15331 // Try to create the new constants with MOVI, and if so generate a fneg
15332 // for it.
15333 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15334 SDLoc DL(Op);
15335 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15336 return DAG.getNode(
15337 AArch64ISD::NVCAST, DL, VT,
15338 DAG.getNode(ISD::FNEG, DL, VFVT,
15339 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15340 }
15341 return SDValue();
15342 };
15343 SDValue R;
15344 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15345 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15346 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15347 return R;
15348 }
15349
15350 return SDValue();
15351}
15352
15353SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15354 SDValue Op, SelectionDAG &DAG) const {
15355 EVT VT = Op.getValueType();
15356 SDLoc DL(Op);
15357 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15358 auto *BVN = cast<BuildVectorSDNode>(Op);
15359
15360 if (auto SeqInfo = BVN->isConstantSequence()) {
15361 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15362 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15363 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15364 return convertFromScalableVector(DAG, VT, Seq);
15365 }
15366
15367 unsigned NumElems = VT.getVectorNumElements();
15368 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15369 NumElems <= 1 || BVN->isConstant())
15370 return SDValue();
15371
15372 auto IsExtractElt = [](SDValue Op) {
15373 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15374 };
15375
15376 // For integer types that are not already in vectors limit to at most four
15377 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15378 if (VT.getScalarType().isInteger() &&
15379 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15380 return SDValue();
15381
15382 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15383 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15384 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15385 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15386 return Op.isUndef() ? Undef
15387 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15388 ContainerVT, Undef, Op, ZeroI64);
15389 });
15390
15391 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15392 while (Intermediates.size() > 1) {
15393 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15394
15395 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15396 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15397 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15398 Intermediates[I / 2] =
15399 Op1.isUndef() ? Op0
15400 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15401 }
15402
15403 Intermediates.resize(Intermediates.size() / 2);
15404 ZipEC = ZipEC.divideCoefficientBy(2);
15405 }
15406
15407 assert(Intermediates.size() == 1);
15408 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15409 return convertFromScalableVector(DAG, VT, Vec);
15410}
15411
15412SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15413 SelectionDAG &DAG) const {
15414 EVT VT = Op.getValueType();
15415
15416 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15417 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15418 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15419 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15420
15421 // Try to build a simple constant vector.
15422 Op = NormalizeBuildVector(Op, DAG);
15423 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15424 // abort.
15425 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15426 return SDValue();
15427
15428 // Certain vector constants, used to express things like logical NOT and
15429 // arithmetic NEG, are passed through unmodified. This allows special
15430 // patterns for these operations to match, which will lower these constants
15431 // to whatever is proven necessary.
15432 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15433 if (BVN->isConstant()) {
15434 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15435 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15436 APInt Val(BitSize,
15437 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15438 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15439 return Op;
15440 }
15441 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15442 if (Const->isZero() && !Const->isNegative())
15443 return Op;
15444 }
15445
15446 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15447 return V;
15448
15449 // Scan through the operands to find some interesting properties we can
15450 // exploit:
15451 // 1) If only one value is used, we can use a DUP, or
15452 // 2) if only the low element is not undef, we can just insert that, or
15453 // 3) if only one constant value is used (w/ some non-constant lanes),
15454 // we can splat the constant value into the whole vector then fill
15455 // in the non-constant lanes.
15456 // 4) FIXME: If different constant values are used, but we can intelligently
15457 // select the values we'll be overwriting for the non-constant
15458 // lanes such that we can directly materialize the vector
15459 // some other way (MOVI, e.g.), we can be sneaky.
15460 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15461 SDLoc DL(Op);
15462 unsigned NumElts = VT.getVectorNumElements();
15463 bool isOnlyLowElement = true;
15464 bool usesOnlyOneValue = true;
15465 bool usesOnlyOneConstantValue = true;
15466 bool isConstant = true;
15467 bool AllLanesExtractElt = true;
15468 unsigned NumConstantLanes = 0;
15469 unsigned NumDifferentLanes = 0;
15470 unsigned NumUndefLanes = 0;
15471 SDValue Value;
15472 SDValue ConstantValue;
15473 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15474 unsigned ConsecutiveValCount = 0;
15475 SDValue PrevVal;
15476 for (unsigned i = 0; i < NumElts; ++i) {
15477 SDValue V = Op.getOperand(i);
15478 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15479 AllLanesExtractElt = false;
15480 if (V.isUndef()) {
15481 ++NumUndefLanes;
15482 continue;
15483 }
15484 if (i > 0)
15485 isOnlyLowElement = false;
15486 if (!isIntOrFPConstant(V))
15487 isConstant = false;
15488
15489 if (isIntOrFPConstant(V)) {
15490 ++NumConstantLanes;
15491 if (!ConstantValue.getNode())
15492 ConstantValue = V;
15493 else if (ConstantValue != V)
15494 usesOnlyOneConstantValue = false;
15495 }
15496
15497 if (!Value.getNode())
15498 Value = V;
15499 else if (V != Value) {
15500 usesOnlyOneValue = false;
15501 ++NumDifferentLanes;
15502 }
15503
15504 if (PrevVal != V) {
15505 ConsecutiveValCount = 0;
15506 PrevVal = V;
15507 }
15508
15509 // Keep the different values and their last consecutive counts. For example,
15510 //
15511 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15512 // t24, t24, t24, t24, t24, t24, t24, t24
15513 // t23 = consecutive count 8
15514 // t24 = consecutive count 8
15515 // ------------------------------------------------------------------
15516 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15517 // t24, t24, t24, t24, t24, t24, t24, t24
15518 // t23 = consecutive count 5
15519 // t24 = consecutive count 9
15520 DifferentValueMap[V] = ++ConsecutiveValCount;
15521 }
15522
15523 if (!Value.getNode()) {
15524 LLVM_DEBUG(
15525 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15526 return DAG.getUNDEF(VT);
15527 }
15528
15529 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15530 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15531 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15532 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15533 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15534 "SCALAR_TO_VECTOR node\n");
15535 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15536 }
15537
15538 if (AllLanesExtractElt) {
15539 SDNode *Vector = nullptr;
15540 bool Even = false;
15541 bool Odd = false;
15542 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15543 // the Odd pattern <1,3,5,...>.
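// E.g. <a[0],a[2],a[4],a[6]> is UZP1 of the two halves of a, and
// <a[1],a[3],a[5],a[7]> is UZP2.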
15544 for (unsigned i = 0; i < NumElts; ++i) {
15545 SDValue V = Op.getOperand(i);
15546 const SDNode *N = V.getNode();
15547 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15548 Even = false;
15549 Odd = false;
15550 break;
15551 }
15552 SDValue N0 = N->getOperand(0);
15553
15554 // All elements are extracted from the same vector.
15555 if (!Vector) {
15556 Vector = N0.getNode();
15557 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15558 // BUILD_VECTOR.
15559 if (VT.getVectorElementType() !=
15560 N0.getValueType().getVectorElementType())
15561 break;
15562 } else if (Vector != N0.getNode()) {
15563 Odd = false;
15564 Even = false;
15565 break;
15566 }
15567
15568 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15569 // indices <1,3,5,...>.
15570 uint64_t Val = N->getConstantOperandVal(1);
15571 if (Val == 2 * i) {
15572 Even = true;
15573 continue;
15574 }
15575 if (Val - 1 == 2 * i) {
15576 Odd = true;
15577 continue;
15578 }
15579
15580 // Something does not match: abort.
15581 Odd = false;
15582 Even = false;
15583 break;
15584 }
15585 if (Even || Odd) {
15586 SDValue LHS =
15587 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15588 DAG.getConstant(0, DL, MVT::i64));
15589 SDValue RHS =
15590 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15591 DAG.getConstant(NumElts, DL, MVT::i64));
15592
15593 if (Even && !Odd)
15594 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15595 if (Odd && !Even)
15596 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15597 }
15598 }
15599
15600 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15601 // i32 and try again.
15602 if (usesOnlyOneValue) {
15603 if (!isConstant) {
15604 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15605 Value.getValueType() != VT) {
15606 LLVM_DEBUG(
15607 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15608 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15609 }
15610
15611 // This is actually a DUPLANExx operation, which keeps everything vectory.
15612
15613 SDValue Lane = Value.getOperand(1);
15614 Value = Value.getOperand(0);
15615 if (Value.getValueSizeInBits() == 64) {
15616 LLVM_DEBUG(
15617 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15618 "widening it\n");
15619 Value = WidenVector(Value, DAG);
15620 }
15621
15622 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15623 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15624 }
15625
15626 if (VT.getVectorElementType().isFloatingPoint()) {
15627 SmallVector<SDValue, 8> Ops;
15628 EVT EltTy = VT.getVectorElementType();
15629 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15630 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15631 LLVM_DEBUG(
15632 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15633 "BITCASTS, and try again\n");
15634 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15635 for (unsigned i = 0; i < NumElts; ++i)
15636 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15637 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15638 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15639 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15640 Val.dump(););
15641 Val = LowerBUILD_VECTOR(Val, DAG);
15642 if (Val.getNode())
15643 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15644 }
15645 }
15646
15647 // If we need to insert a small number of different non-constant elements and
15648 // the vector width is sufficiently large, prefer using DUP with the common
15649 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15650 // skip the constant lane handling below.
15651 bool PreferDUPAndInsert =
15652 !isConstant && NumDifferentLanes >= 1 &&
15653 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15654 NumDifferentLanes >= NumConstantLanes;
15655
15656 // If only one constant value was used, and it was used for more than one lane,
15657 // start by splatting that value, then replace the non-constant lanes. This
15658 // is better than the default, which will perform a separate initialization
15659 // for each lane.
15660 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15661 // Firstly, try to materialize the splat constant.
15662 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15663 unsigned BitSize = VT.getScalarSizeInBits();
15664 APInt ConstantValueAPInt(1, 0);
15665 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15666 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15667 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15668 !ConstantValueAPInt.isAllOnes()) {
15669 Val = ConstantBuildVector(Val, DAG, Subtarget);
15670 if (!Val)
15671 // Otherwise, materialize the constant and splat it.
15672 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15673 }
15674
15675 // Now insert the non-constant lanes.
15676 for (unsigned i = 0; i < NumElts; ++i) {
15677 SDValue V = Op.getOperand(i);
15678 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15679 if (!isIntOrFPConstant(V) && !V.isUndef())
15680 // Note that type legalization likely mucked about with the VT of the
15681 // source operand, so we may have to convert it here before inserting.
15682 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15683 }
15684 return Val;
15685 }
15686
15687 // This will generate a load from the constant pool.
15688 if (isConstant) {
15689 LLVM_DEBUG(
15690 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15691 "expansion\n");
15692 return SDValue();
15693 }
15694
15695 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15696 // v4i32s. This is really a truncate, which we can construct out of (legal)
15697 // concats and truncate nodes.
15699 return M;
15700
15701 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15702 if (NumElts >= 4) {
15703 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15704 return Shuffle;
15705
15706 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15707 return Shuffle;
15708 }
15709
15710 if (PreferDUPAndInsert) {
15711 // First, build a constant vector with the common element.
15712 SmallVector<SDValue, 8> Ops(NumElts, Value);
15713 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15714 // Next, insert the elements that do not match the common value.
15715 for (unsigned I = 0; I < NumElts; ++I)
15716 if (Op.getOperand(I) != Value)
15717 NewVector =
15718 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15719 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15720
15721 return NewVector;
15722 }
15723
15724 // If vector consists of two different values, try to generate two DUPs and
15725 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15726 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15727 SmallVector<SDValue, 2> Vals;
15728 // Check that the consecutive-use count of each value is half the number of
15729 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
15730 //
15731 // canUseVECTOR_CONCAT = true;
15732 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15733 // t24, t24, t24, t24, t24, t24, t24, t24
15734 //
15735 // canUseVECTOR_CONCAT = false;
15736 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15737 // t24, t24, t24, t24, t24, t24, t24, t24
15738 bool canUseVECTOR_CONCAT = true;
15739 for (auto Pair : DifferentValueMap) {
15740 // Check that each distinct value accounts for exactly NumElts / 2 lanes.
15741 if (Pair.second != NumElts / 2)
15742 canUseVECTOR_CONCAT = false;
15743 Vals.push_back(Pair.first);
15744 }
15745
15746 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15747 // CONCAT_VECTORs. For example,
15748 //
15749 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15750 // t24, t24, t24, t24, t24, t24, t24, t24
15751 // ==>
15752 // t26: v8i8 = AArch64ISD::DUP t23
15753 // t28: v8i8 = AArch64ISD::DUP t24
15754 // t29: v16i8 = concat_vectors t26, t28
15755 if (canUseVECTOR_CONCAT) {
15756 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15757 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15758 SubVT.getVectorNumElements() >= 2) {
15759 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15760 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15761 SDValue DUP1 =
15762 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15763 SDValue DUP2 =
15764 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15765 SDValue CONCAT_VECTORS =
15766 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15767 return CONCAT_VECTORS;
15768 }
15769 }
15770
15771 // Let's try to generate VECTOR_SHUFFLE. For example,
15772 //
15773 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15774 // ==>
15775 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15776 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15777 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15778 if (NumElts >= 8) {
15779 SmallVector<int, 16> MaskVec;
15780 // Build mask for VECTOR_SHUFFLE.
15781 SDValue FirstLaneVal = Op.getOperand(0);
15782 for (unsigned i = 0; i < NumElts; ++i) {
15783 SDValue Val = Op.getOperand(i);
15784 if (FirstLaneVal == Val)
15785 MaskVec.push_back(i);
15786 else
15787 MaskVec.push_back(i + NumElts);
15788 }
15789
15790 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15791 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15792 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15793 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15794 SDValue VECTOR_SHUFFLE =
15795 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15796 return VECTOR_SHUFFLE;
15797 }
15798 }
15799
15800 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15801 // know the default expansion would otherwise fall back on something even
15802 // worse. For a vector with one or two non-undef values, that's
15803 // scalar_to_vector for the elements followed by a shuffle (provided the
15804 // shuffle is valid for the target) and materialization element by element
15805 // on the stack followed by a load for everything else.
15806 if (!isConstant && !usesOnlyOneValue) {
15807 LLVM_DEBUG(
15808 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15809 "of INSERT_VECTOR_ELT\n");
15810
15811 SDValue Vec = DAG.getUNDEF(VT);
15812 SDValue Op0 = Op.getOperand(0);
15813 unsigned i = 0;
15814
15815 // Use SCALAR_TO_VECTOR for lane zero to
15816 // a) Avoid a RMW dependency on the full vector register, and
15817 // b) Allow the register coalescer to fold away the copy if the
15818 // value is already in an S or D register, and we're forced to emit an
15819 // INSERT_SUBREG that we can't fold anywhere.
15820 //
15821 // We also allow types like i8 and i16 which are illegal scalar but legal
15822 // vector element types. After type-legalization the inserted value is
15823 // extended (i32) and it is safe to cast them to the vector type by ignoring
15824 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15825 if (!Op0.isUndef()) {
15826 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15827 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15828 ++i;
15829 }
15830 LLVM_DEBUG({
15831 if (i < NumElts)
15832 dbgs() << "Creating nodes for the other vector elements:\n";
15833 });
15834 for (; i < NumElts; ++i) {
15835 SDValue V = Op.getOperand(i);
15836 if (V.isUndef())
15837 continue;
15838 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15839 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15840 }
15841 return Vec;
15842 }
15843
15844 LLVM_DEBUG(
15845 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15846 "better alternative\n");
15847 return SDValue();
15848}
15849
15850SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15851 SelectionDAG &DAG) const {
15852 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15853 !Subtarget->isNeonAvailable()))
15854 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15855
15856 assert(Op.getValueType().isScalableVector() &&
15857 isTypeLegal(Op.getValueType()) &&
15858 "Expected legal scalable vector type!");
15859
15860 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15861 unsigned NumOperands = Op->getNumOperands();
15862 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15863 "Unexpected number of operands in CONCAT_VECTORS");
15864
15865 if (NumOperands == 2)
15866 return Op;
15867
15868 // Concat each pair of subvectors and pack into the lower half of the array.
15869 SmallVector<SDValue> ConcatOps(Op->ops());
15870 while (ConcatOps.size() > 1) {
15871 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15872 SDValue V1 = ConcatOps[I];
15873 SDValue V2 = ConcatOps[I + 1];
15874 EVT SubVT = V1.getValueType();
15875 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15876 ConcatOps[I / 2] =
15877 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15878 }
15879 ConcatOps.resize(ConcatOps.size() / 2);
15880 }
15881 return ConcatOps[0];
15882 }
15883
15884 return SDValue();
15885}
15886
15887SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15888 SelectionDAG &DAG) const {
15889 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15890
15891 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15892 !Subtarget->isNeonAvailable()))
15893 return LowerFixedLengthInsertVectorElt(Op, DAG);
15894
15895 EVT VT = Op.getOperand(0).getValueType();
15896
15897 if (VT.getScalarType() == MVT::i1) {
15898 EVT VectorVT = getPromotedVTForPredicate(VT);
15899 SDLoc DL(Op);
15900 SDValue ExtendedVector =
15901 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15902 SDValue ExtendedValue =
15903 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15904 VectorVT.getScalarType().getSizeInBits() < 32
15905 ? MVT::i32
15906 : VectorVT.getScalarType());
15907 ExtendedVector =
15908 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15909 ExtendedValue, Op.getOperand(2));
15910 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15911 }
15912
15913 // Check for non-constant or out of range lane.
15914 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15915 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15916 return SDValue();
15917
15918 return Op;
15919}
15920
15921SDValue
15922AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15923 SelectionDAG &DAG) const {
15924 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15925 EVT VT = Op.getOperand(0).getValueType();
15926
15927 if (VT.getScalarType() == MVT::i1) {
15928 // We can't directly extract from an SVE predicate; extend it first.
15929 // (This isn't the only possible lowering, but it's straightforward.)
15930 EVT VectorVT = getPromotedVTForPredicate(VT);
15931 SDLoc DL(Op);
15932 SDValue Extend =
15933 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15934 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15935 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15936 Extend, Op.getOperand(1));
15937 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15938 }
15939
15940 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15941 return LowerFixedLengthExtractVectorElt(Op, DAG);
15942
15943 // Check for non-constant or out of range lane.
15944 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15945 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15946 return SDValue();
15947
15948 // Insertion/extraction are legal for V128 types.
15949 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15950 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15951 VT == MVT::v8f16 || VT == MVT::v8bf16)
15952 return Op;
15953
15954 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15955 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15956 VT != MVT::v4bf16)
15957 return SDValue();
15958
15959 // For V64 types, we perform extraction by expanding the value
15960 // to a V128 type and perform the extraction on that.
15961 SDLoc DL(Op);
15962 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15963 EVT WideTy = WideVec.getValueType();
15964
15965 EVT ExtrTy = WideTy.getVectorElementType();
15966 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15967 ExtrTy = MVT::i32;
15968
15969 // For extractions, we just return the result directly.
15970 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15971 Op.getOperand(1));
15972}
15973
15974SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15975 SelectionDAG &DAG) const {
15976 EVT VT = Op.getValueType();
15978 "Only cases that extract a fixed length vector are supported!");
15979 EVT InVT = Op.getOperand(0).getValueType();
15980
15981 // If we don't have legal types yet, do nothing
15982 if (!isTypeLegal(InVT))
15983 return SDValue();
15984
15985 if (InVT.is128BitVector()) {
15986 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15987 unsigned Idx = Op.getConstantOperandVal(1);
15988
15989 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15990 if (Idx == 0)
15991 return Op;
15992
15993 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15994 // that directly.
15995 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15996 return Op;
15997 }
15998
15999 if (InVT.isScalableVector() ||
16000 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16001 SDLoc DL(Op);
16002 SDValue Vec = Op.getOperand(0);
16003 SDValue Idx = Op.getOperand(1);
16004
16005 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16006 if (PackedVT != InVT) {
16007 // Pack input into the bottom part of an SVE register and try again.
16008 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16009 DAG.getUNDEF(PackedVT), Vec,
16010 DAG.getVectorIdxConstant(0, DL));
16011 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16012 }
16013
16014 // This will get matched by custom code during ISelDAGToDAG.
16015 if (isNullConstant(Idx))
16016 return Op;
16017
16018 assert(InVT.isScalableVector() && "Unexpected vector type!");
16019 // Move requested subvector to the start of the vector and try again.
16020 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16021 return convertFromScalableVector(DAG, VT, Splice);
16022 }
16023
16024 return SDValue();
16025}
16026
16027SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16028 SelectionDAG &DAG) const {
16029 assert(Op.getValueType().isScalableVector() &&
16030 "Only expect to lower inserts into scalable vectors!");
16031
16032 EVT InVT = Op.getOperand(1).getValueType();
16033 unsigned Idx = Op.getConstantOperandVal(2);
16034
16035 SDValue Vec0 = Op.getOperand(0);
16036 SDValue Vec1 = Op.getOperand(1);
16037 SDLoc DL(Op);
16038 EVT VT = Op.getValueType();
16039
16040 if (InVT.isScalableVector()) {
16041 if (!isTypeLegal(VT))
16042 return SDValue();
16043
16044 // Break down insert_subvector into simpler parts.
16045 if (VT.getVectorElementType() == MVT::i1) {
16046 unsigned NumElts = VT.getVectorMinNumElements();
16047 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16048
16049 SDValue Lo, Hi;
16050 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16051 DAG.getVectorIdxConstant(0, DL));
16052 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16053 DAG.getVectorIdxConstant(NumElts / 2, DL));
16054 if (Idx < (NumElts / 2))
16055 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16056 DAG.getVectorIdxConstant(Idx, DL));
16057 else
16058 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16059 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16060
16061 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16062 }
16063
16064 // We can select these directly.
16065 if (isTypeLegal(InVT) && Vec0.isUndef())
16066 return Op;
16067
16068 // Ensure the subvector is half the size of the main vector.
16069 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16070 return SDValue();
16071
16072 // Here narrow and wide refers to the vector element types. After "casting"
16073 // both vectors must have the same bit length and so because the subvector
16074 // has fewer elements, those elements need to be bigger.
16075 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16076 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16077
16078 // NOP cast operands to the largest legal vector of the same element count.
16079 if (VT.isFloatingPoint()) {
16080 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16081 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16082 } else {
16083 // Legal integer vectors are already their largest so Vec0 is fine as is.
16084 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16085 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16086 }
16087
16088 // To replace the top/bottom half of vector V with vector SubV we widen the
16089 // preserved half of V, concatenate this to SubV (the order depending on the
16090 // half being replaced) and then narrow the result.
16091 SDValue Narrow;
16092 if (Idx == 0) {
16093 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16094 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16095 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16096 } else {
16097 assert(Idx == InVT.getVectorMinNumElements() &&
16098 "Invalid subvector index!");
16099 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16100 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16101 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16102 }
16103
16104 return getSVESafeBitCast(VT, Narrow, DAG);
16105 }
16106
16107 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16108 // This will be matched by custom code during ISelDAGToDAG.
16109 if (Vec0.isUndef())
16110 return Op;
16111
16112 std::optional<unsigned> PredPattern =
16114 auto PredTy = VT.changeVectorElementType(MVT::i1);
16115 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16116 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16117 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16118 }
16119
16120 return SDValue();
16121}
16122
16123static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16124 if (Op.getOpcode() != AArch64ISD::DUP &&
16125 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16126 Op.getOpcode() != ISD::BUILD_VECTOR)
16127 return false;
16128
16129 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16130 !isAllConstantBuildVector(Op, SplatVal))
16131 return false;
16132
16133 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16134 !isa<ConstantSDNode>(Op->getOperand(0)))
16135 return false;
16136
16137 SplatVal = Op->getConstantOperandVal(0);
16138 if (Op.getValueType().getVectorElementType() != MVT::i64)
16139 SplatVal = (int32_t)SplatVal;
16140
16141 Negated = false;
16142 if (isPowerOf2_64(SplatVal))
16143 return true;
16144
16145 Negated = true;
16146 if (isPowerOf2_64(-SplatVal)) {
16147 SplatVal = -SplatVal;
16148 return true;
16149 }
16150
16151 return false;
16152}
16153
16154SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16155 EVT VT = Op.getValueType();
16156 SDLoc DL(Op);
16157
16158 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16159 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16160
16161 assert(VT.isScalableVector() && "Expected a scalable vector.");
16162
16163 bool Signed = Op.getOpcode() == ISD::SDIV;
16164 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16165
16166 bool Negated;
16167 uint64_t SplatVal;
16168 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
16170 SDValue Res =
16171 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16172 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16173 if (Negated)
16174 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16175
16176 return Res;
16177 }
16178
16179 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16180 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16181
16182 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16183 // operations, and truncate the result.
16184 EVT WidenedVT;
16185 if (VT == MVT::nxv16i8)
16186 WidenedVT = MVT::nxv8i16;
16187 else if (VT == MVT::nxv8i16)
16188 WidenedVT = MVT::nxv4i32;
16189 else
16190 llvm_unreachable("Unexpected Custom DIV operation");
16191
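// For example, an sdiv of nxv16i8 operands is unpacked with SUNPKLO/SUNPKHI
// into two nxv8i16 divisions; those recurse through this lowering until the
// legal nxv4i32 SDIV_PRED is reached, and the widened results are narrowed
// back into a single vector with UZP1.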
16192 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16193 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16194 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16195 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16196 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16197 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16198 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16199 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16200 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16201 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16202 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16203}
16204
16205bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16206 EVT VT, unsigned DefinedValues) const {
16207 if (!Subtarget->isNeonAvailable())
16208 return false;
16210}
16211
16212 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16213 // Currently no fixed length shuffles that require SVE are legal.
16214 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16215 return false;
16216
16217 if (VT.getVectorNumElements() == 4 &&
16218 (VT.is128BitVector() || VT.is64BitVector())) {
16219 unsigned Cost = getPerfectShuffleCost(M);
16220 if (Cost <= 1)
16221 return true;
16222 }
16223
16224 bool DummyBool;
16225 int DummyInt;
16226 unsigned DummyUnsigned;
16227
16228 unsigned EltSize = VT.getScalarSizeInBits();
16229 unsigned NumElts = VT.getVectorNumElements();
16231 isREVMask(M, EltSize, NumElts, 64) ||
16232 isREVMask(M, EltSize, NumElts, 32) ||
16233 isREVMask(M, EltSize, NumElts, 16) ||
16234 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16235 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16236 isTRNMask(M, NumElts, DummyUnsigned) ||
16237 isUZPMask(M, NumElts, DummyUnsigned) ||
16238 isZIPMask(M, NumElts, DummyUnsigned) ||
16239 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16240 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16241 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16242 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16243 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16244}
16245
16246 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16247 EVT VT) const {
16248 // Just delegate to the generic legality, clear masks aren't special.
16249 return isShuffleMaskLegal(M, VT);
16250}
16251
16252/// getVShiftImm - Check if this is a valid build_vector for the immediate
16253/// operand of a vector shift operation, where all the elements of the
16254/// build_vector must have the same constant integer value.
16255static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16256 // Ignore bit_converts.
16257 while (Op.getOpcode() == ISD::BITCAST)
16258 Op = Op.getOperand(0);
16260 APInt SplatBits, SplatUndef;
16261 unsigned SplatBitSize;
16262 bool HasAnyUndefs;
16263 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16264 HasAnyUndefs, ElementBits) ||
16265 SplatBitSize > ElementBits)
16266 return false;
16267 Cnt = SplatBits.getSExtValue();
16268 return true;
16269}
16270
16271/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16272/// operand of a vector shift left operation. That value must be in the range:
16273/// 0 <= Value < ElementBits for a left shift; or
16274/// 0 <= Value <= ElementBits for a long left shift.
16275static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16276 assert(VT.isVector() && "vector shift count is not a vector type");
16277 int64_t ElementBits = VT.getScalarSizeInBits();
16278 if (!getVShiftImm(Op, ElementBits, Cnt))
16279 return false;
16280 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16281}
16282
16283/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16284/// operand of a vector shift right operation. The value must be in the range:
16285 ///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
16286static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16287 assert(VT.isVector() && "vector shift count is not a vector type");
16288 int64_t ElementBits = VT.getScalarSizeInBits();
16289 if (!getVShiftImm(Op, ElementBits, Cnt))
16290 return false;
16291 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16292}
16293
16294SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16295 SelectionDAG &DAG) const {
16296 EVT VT = Op.getValueType();
16297
16298 if (VT.getScalarType() == MVT::i1) {
16299 // Lower i1 truncate to `(x & 1) != 0`.
16300 SDLoc DL(Op);
16301 EVT OpVT = Op.getOperand(0).getValueType();
16302 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16303 SDValue One = DAG.getConstant(1, DL, OpVT);
16304 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16305 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16306 }
16307
16308 if (!VT.isVector() || VT.isScalableVector())
16309 return SDValue();
16310
16311 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16312 !Subtarget->isNeonAvailable()))
16313 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16314
16315 return SDValue();
16316}
16317
16318 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16319 // possibly a truncated type; it tells how many bits of the value are to be
16320// used.
16322 SelectionDAG &DAG,
16323 unsigned &ShiftValue,
16324 SDValue &RShOperand) {
16325 if (Shift->getOpcode() != ISD::SRL)
16326 return false;
16327
16328 EVT VT = Shift.getValueType();
16329 assert(VT.isScalableVT());
16330
16331 auto ShiftOp1 =
16332 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16333 if (!ShiftOp1)
16334 return false;
16335
16336 ShiftValue = ShiftOp1->getZExtValue();
16337 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16338 return false;
16339
16340 SDValue Add = Shift->getOperand(0);
16341 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16342 return false;
16343
16345 "ResVT must be truncated or same type as the shift.");
16346 // Check if an overflow can lead to incorrect results.
16347 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16348 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16349 return false;
16350
16351 auto AddOp1 =
16352 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16353 if (!AddOp1)
16354 return false;
16355 uint64_t AddValue = AddOp1->getZExtValue();
16356 if (AddValue != 1ULL << (ShiftValue - 1))
16357 return false;
16358
16359 RShOperand = Add->getOperand(0);
16360 return true;
16361}
16362
16363SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16364 SelectionDAG &DAG) const {
16365 EVT VT = Op.getValueType();
16366 SDLoc DL(Op);
16367 int64_t Cnt;
16368
16369 if (!Op.getOperand(1).getValueType().isVector())
16370 return Op;
16371 unsigned EltSize = VT.getScalarSizeInBits();
16372
16373 switch (Op.getOpcode()) {
16374 case ISD::SHL:
16375 if (VT.isScalableVector() ||
16376 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16377 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16378
16379 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16380 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16381 DAG.getConstant(Cnt, DL, MVT::i32));
16382 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16383 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16384 MVT::i32),
16385 Op.getOperand(0), Op.getOperand(1));
16386 case ISD::SRA:
16387 case ISD::SRL:
16388 if (VT.isScalableVector() &&
16389 (Subtarget->hasSVE2() ||
16390 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16391 SDValue RShOperand;
16392 unsigned ShiftValue;
16393 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16394 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16395 getPredicateForVector(DAG, DL, VT), RShOperand,
16396 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16397 }
16398
16399 if (VT.isScalableVector() ||
16400 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16401 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16402 : AArch64ISD::SRL_PRED;
16403 return LowerToPredicatedOp(Op, DAG, Opc);
16404 }
16405
16406 // Right shift immediate
16407 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16408 unsigned Opc =
16409 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16410 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16411 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
16412 }
16413
16414 // Right shift register. Note that there is no right-shift-by-register
16415 // instruction, but the left-shift-by-register instruction takes a signed
16416 // value, where negative amounts specify a right shift.
16417 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16418 : Intrinsic::aarch64_neon_ushl;
16419 // negate the shift amount
16420 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16421 Op.getOperand(1));
16422 SDValue NegShiftLeft =
16423 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16424 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16425 NegShift);
16426 return NegShiftLeft;
16427 }
16428
16429 llvm_unreachable("unexpected shift opcode");
16430}
16431
16432SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16433 SelectionDAG &DAG) const {
16434 if (Op.getValueType().isScalableVector())
16435 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16436
16437 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16438 !Subtarget->isNeonAvailable()))
16439 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16440
16441 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16442 SDValue LHS = Op.getOperand(0);
16443 SDValue RHS = Op.getOperand(1);
16444 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16445 SDLoc DL(Op);
16446
16447 if (LHS.getValueType().getVectorElementType().isInteger())
16448 return Op;
16449
16450 assert(((!Subtarget->hasFullFP16() &&
16451 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16452 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16453 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16454 "Unexpected type!");
16455
16456 // Lower isnan(x) | isnan(never-nan) to x != x.
16457 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16458 if (CC == ISD::SETUO || CC == ISD::SETO) {
16459 bool OneNaN = false;
16460 if (LHS == RHS) {
16461 OneNaN = true;
16462 } else if (DAG.isKnownNeverNaN(RHS)) {
16463 OneNaN = true;
16464 RHS = LHS;
16465 } else if (DAG.isKnownNeverNaN(LHS)) {
16466 OneNaN = true;
16467 LHS = RHS;
16468 }
16469 if (OneNaN) {
16470 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16471 }
16472 }
16473
16474 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16475 // clean. Some of them require two branches to implement.
16476 AArch64CC::CondCode CC1, CC2;
16477 bool ShouldInvert;
16478 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16479
16480 bool NoNaNs =
16481 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16482 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16483 if (!Cmp.getNode())
16484 return SDValue();
16485
16486 if (CC2 != AArch64CC::AL) {
16487 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16488 if (!Cmp2.getNode())
16489 return SDValue();
16490
16491 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16492 }
16493
16494 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16495
16496 if (ShouldInvert)
16497 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16498
16499 return Cmp;
16500}
16501
16502static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16503 SelectionDAG &DAG) {
16504 SDValue VecOp = ScalarOp.getOperand(0);
16505 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16506 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16507 DAG.getConstant(0, DL, MVT::i64));
16508}
16509
16510static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16511 SDLoc DL, SelectionDAG &DAG) {
16512 unsigned ScalarOpcode;
16513 switch (Opcode) {
16514 case ISD::VECREDUCE_AND:
16515 ScalarOpcode = ISD::AND;
16516 break;
16517 case ISD::VECREDUCE_OR:
16518 ScalarOpcode = ISD::OR;
16519 break;
16520 case ISD::VECREDUCE_XOR:
16521 ScalarOpcode = ISD::XOR;
16522 break;
16523 default:
16524 llvm_unreachable("Expected bitwise vector reduction");
16525 return SDValue();
16526 }
16527
16528 EVT VecVT = Vec.getValueType();
16529 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16530 "Expected power-of-2 length vector");
16531
16532 EVT ElemVT = VecVT.getVectorElementType();
16533
16534 SDValue Result;
16535 unsigned NumElems = VecVT.getVectorNumElements();
16536
16537 // Special case for boolean reductions
16538 if (ElemVT == MVT::i1) {
16539 // Split large vectors into smaller ones
16540 if (NumElems > 16) {
16541 SDValue Lo, Hi;
16542 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16543 EVT HalfVT = Lo.getValueType();
16544 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16545 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16546 }
16547
16548 // Results of setcc operations get widened to 128 bits if their input
16549 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16550 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16551 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16552 // size leads to the best codegen, since e.g. setcc results might need to be
16553 // truncated otherwise.
16554 unsigned ExtendedWidth = 64;
16555 if (Vec.getOpcode() == ISD::SETCC &&
16556 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16557 ExtendedWidth = 128;
16558 }
16559 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16560
16561 // any_extend doesn't work with umin/umax, so only use it for the add (XOR) case.
16562 unsigned ExtendOp =
16563 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16564 SDValue Extended = DAG.getNode(
16565 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16566 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16567 // in that case we bitcast the sign extended values from v2i64 to v4i32
16568 // before reduction for optimal code generation.
16569 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16570 NumElems == 2 && ExtendedWidth == 128) {
16571 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16572 ExtendedVT = MVT::i32;
16573 }
16574 switch (ScalarOpcode) {
16575 case ISD::AND:
16576 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16577 break;
16578 case ISD::OR:
16579 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16580 break;
16581 case ISD::XOR:
16582 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16583 break;
16584 default:
16585 llvm_unreachable("Unexpected Opcode");
16586 }
16587
16588 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16589 } else {
16590 // Iteratively split the vector in half and combine using the bitwise
16591 // operation until it fits in a 64 bit register.
16592 while (VecVT.getSizeInBits() > 64) {
16593 SDValue Lo, Hi;
16594 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16595 VecVT = Lo.getValueType();
16596 NumElems = VecVT.getVectorNumElements();
16597 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16598 }
16599
16600 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16601
16602 // Do the remaining work on a scalar since it allows the code generator to
16603 // combine the shift and bitwise operation into one instruction and since
16604 // integer instructions can have higher throughput than vector instructions.
16605 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16606
16607 // Iteratively combine the lower and upper halves of the scalar using the
16608 // bitwise operation, halving the relevant region of the scalar in each
16609 // iteration, until the relevant region is just one element of the original
16610 // vector.
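// For example, a v8i8 AND reduction becomes a bitcast to i64 followed by
// three srl+and steps with shift amounts 32, 16 and 8; the low byte of the
// final scalar holds the reduced value.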
16611 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16612 SDValue ShiftAmount =
16613 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16614 SDValue Shifted =
16615 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16616 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16617 }
16618
16619 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16620 }
16621
16622 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16623}
16624
16625SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16626 SelectionDAG &DAG) const {
16627 SDValue Src = Op.getOperand(0);
16628 EVT SrcVT = Src.getValueType();
16629
16630 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16631 // widening by inserting zeroes.
16632 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16633 SrcVT == MVT::v2f16) {
16634 SDLoc DL(Op);
16635 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16636 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16637 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16638 }
16639
16640 // Try to lower fixed length reductions to SVE.
16641 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16642 Op.getOpcode() == ISD::VECREDUCE_AND ||
16643 Op.getOpcode() == ISD::VECREDUCE_OR ||
16644 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16645 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16646 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16647 SrcVT.getVectorElementType() == MVT::i64);
16648 if (SrcVT.isScalableVector() ||
16649 useSVEForFixedLengthVectorVT(
16650 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16651
16652 if (SrcVT.getVectorElementType() == MVT::i1)
16653 return LowerPredReductionToSVE(Op, DAG);
16654
16655 switch (Op.getOpcode()) {
16656 case ISD::VECREDUCE_ADD:
16657 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16658 case ISD::VECREDUCE_AND:
16659 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16660 case ISD::VECREDUCE_OR:
16661 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16662 case ISD::VECREDUCE_SMAX:
16663 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16664 case ISD::VECREDUCE_SMIN:
16665 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16666 case ISD::VECREDUCE_UMAX:
16667 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16668 case ISD::VECREDUCE_UMIN:
16669 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16670 case ISD::VECREDUCE_XOR:
16671 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16672 case ISD::VECREDUCE_FADD:
16673 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16674 case ISD::VECREDUCE_FMAX:
16675 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16676 case ISD::VECREDUCE_FMIN:
16677 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16678 case ISD::VECREDUCE_FMAXIMUM:
16679 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16680 case ISD::VECREDUCE_FMINIMUM:
16681 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16682 default:
16683 llvm_unreachable("Unhandled fixed length reduction");
16684 }
16685 }
16686
16687 // Lower NEON reductions.
16688 SDLoc DL(Op);
16689 switch (Op.getOpcode()) {
16690 case ISD::VECREDUCE_AND:
16691 case ISD::VECREDUCE_OR:
16692 case ISD::VECREDUCE_XOR:
16693 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16694 Op.getValueType(), DL, DAG);
16695 case ISD::VECREDUCE_ADD:
16696 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16697 case ISD::VECREDUCE_SMAX:
16698 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16699 case ISD::VECREDUCE_SMIN:
16700 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16701 case ISD::VECREDUCE_UMAX:
16702 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16703 case ISD::VECREDUCE_UMIN:
16704 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16705 default:
16706 llvm_unreachable("Unhandled reduction");
16707 }
16708}
16709
16710SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16711 SelectionDAG &DAG) const {
16712 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16713 // No point replacing if we don't have the relevant instruction/libcall anyway
16714 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16715 return SDValue();
16716
16717 // LSE has an atomic load-clear instruction, but not a load-and.
16718 SDLoc DL(Op);
16719 MVT VT = Op.getSimpleValueType();
16720 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16721 SDValue RHS = Op.getOperand(2);
16722 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16723 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16724 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16725 Op.getOperand(0), Op.getOperand(1), RHS,
16726 AN->getMemOperand());
16727}
16728
16729SDValue
16730AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16731 SelectionDAG &DAG) const {
16732
16733 SDLoc DL(Op);
16734 // Get the inputs.
16735 SDNode *Node = Op.getNode();
16736 SDValue Chain = Op.getOperand(0);
16737 SDValue Size = Op.getOperand(1);
16738 MaybeAlign Align =
16739 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16740 EVT VT = Node->getValueType(0);
16741
16743 "no-stack-arg-probe")) {
16744 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16745 Chain = SP.getValue(1);
16746 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16747 if (Align)
16748 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16749 DAG.getSignedConstant(-Align->value(), DL, VT));
16750 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16751 SDValue Ops[2] = {SP, Chain};
16752 return DAG.getMergeValues(Ops, DL);
16753 }
16754
16755 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16756
16757 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16758 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16759 PtrVT, 0);
16760
16761 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16762 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16763 if (Subtarget->hasCustomCallingConv())
16764 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16765
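// The Windows AArch64 __chkstk convention passes the requested allocation
// size in X15 in units of 16 bytes, hence the shift right by 4 here and the
// corresponding shift left by 4 after the call.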
16766 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16767 DAG.getConstant(4, DL, MVT::i64));
16768 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16769 Chain =
16770 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16771 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16772 DAG.getRegisterMask(Mask), Chain.getValue(1));
16773 // To match the actual intent better, we should read the output from X15 here
16774 // again (instead of potentially spilling it to the stack), but rereading Size
16775 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16776 // here.
16777
16778 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16779 DAG.getConstant(4, DL, MVT::i64));
16780
16781 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16782 Chain = SP.getValue(1);
16783 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16784 if (Align)
16785 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16786 DAG.getSignedConstant(-Align->value(), DL, VT));
16787 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16788
16789 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16790
16791 SDValue Ops[2] = {SP, Chain};
16792 return DAG.getMergeValues(Ops, DL);
16793}
16794
16795SDValue
16796AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16797 SelectionDAG &DAG) const {
16798 // Get the inputs.
16799 SDNode *Node = Op.getNode();
16800 SDValue Chain = Op.getOperand(0);
16801 SDValue Size = Op.getOperand(1);
16802
16803 MaybeAlign Align =
16804 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16805 SDLoc DL(Op);
16806 EVT VT = Node->getValueType(0);
16807
16808 // Construct the new SP value in a GPR.
16809 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16810 Chain = SP.getValue(1);
16811 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16812 if (Align)
16813 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16814 DAG.getSignedConstant(-Align->value(), DL, VT));
16815
16816 // Set the real SP to the new value with a probing loop.
16817 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16818 SDValue Ops[2] = {SP, Chain};
16819 return DAG.getMergeValues(Ops, DL);
16820}
16821
16822SDValue
16823AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16824 SelectionDAG &DAG) const {
16825 MachineFunction &MF = DAG.getMachineFunction();
16826
16827 if (Subtarget->isTargetWindows())
16828 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16829 else if (hasInlineStackProbe(MF))
16830 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16831 else
16832 return SDValue();
16833}
16834
16835SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16836 unsigned NewOp) const {
16837 if (Subtarget->hasSVE2())
16838 return LowerToPredicatedOp(Op, DAG, NewOp);
16839
16840 // Default to expand.
16841 return SDValue();
16842}
16843
16844SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16845 SelectionDAG &DAG) const {
16846 EVT VT = Op.getValueType();
16847 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16848
16849 SDLoc DL(Op);
16850 APInt MulImm = Op.getConstantOperandAPInt(0);
16851 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16852 VT);
16853}
16854
16855/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16856template <unsigned NumVecs>
16857static bool
16861 // Retrieve EC from first vector argument.
16862 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16864#ifndef NDEBUG
16865 // Check the assumption that all input vectors are the same type.
16866 for (unsigned I = 0; I < NumVecs; ++I)
16867 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16868 "Invalid type.");
16869#endif
16870 // memVT is `NumVecs * VT`.
16872 EC * NumVecs);
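// For example, for an aarch64_sve_st2 of two nxv4i32 vectors, memVT describes
// nxv8i32 and ptrVal below is the final (pointer) argument of the call.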
16873 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16874 Info.offset = 0;
16875 Info.align.reset();
16877 return true;
16878}
16879
16880/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16881/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16882/// specified in the intrinsic calls.
16883 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16884 const CallInst &I,
16885 MachineFunction &MF,
16886 unsigned Intrinsic) const {
16887 auto &DL = I.getDataLayout();
16888 switch (Intrinsic) {
16889 case Intrinsic::aarch64_sve_st2:
16890 return setInfoSVEStN<2>(*this, DL, Info, I);
16891 case Intrinsic::aarch64_sve_st3:
16892 return setInfoSVEStN<3>(*this, DL, Info, I);
16893 case Intrinsic::aarch64_sve_st4:
16894 return setInfoSVEStN<4>(*this, DL, Info, I);
16895 case Intrinsic::aarch64_neon_ld2:
16896 case Intrinsic::aarch64_neon_ld3:
16897 case Intrinsic::aarch64_neon_ld4:
16898 case Intrinsic::aarch64_neon_ld1x2:
16899 case Intrinsic::aarch64_neon_ld1x3:
16900 case Intrinsic::aarch64_neon_ld1x4: {
16901 Info.opc = ISD::INTRINSIC_W_CHAIN;
16902 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16903 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16904 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16905 Info.offset = 0;
16906 Info.align.reset();
16907 // volatile loads with NEON intrinsics not supported
16908 Info.flags = MachineMemOperand::MOLoad;
16909 return true;
16910 }
16911 case Intrinsic::aarch64_neon_ld2lane:
16912 case Intrinsic::aarch64_neon_ld3lane:
16913 case Intrinsic::aarch64_neon_ld4lane:
16914 case Intrinsic::aarch64_neon_ld2r:
16915 case Intrinsic::aarch64_neon_ld3r:
16916 case Intrinsic::aarch64_neon_ld4r: {
16917 Info.opc = ISD::INTRINSIC_W_CHAIN;
16918 // The ldN/ld1xN intrinsics return a struct whose members all have the same vector type.
16919 Type *RetTy = I.getType();
16920 auto *StructTy = cast<StructType>(RetTy);
16921 unsigned NumElts = StructTy->getNumElements();
16922 Type *VecTy = StructTy->getElementType(0);
16923 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16924 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16925 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16926 Info.offset = 0;
16927 Info.align.reset();
16928 // volatile loads with NEON intrinsics not supported
16929 Info.flags = MachineMemOperand::MOLoad;
16930 return true;
16931 }
16932 case Intrinsic::aarch64_neon_st2:
16933 case Intrinsic::aarch64_neon_st3:
16934 case Intrinsic::aarch64_neon_st4:
16935 case Intrinsic::aarch64_neon_st1x2:
16936 case Intrinsic::aarch64_neon_st1x3:
16937 case Intrinsic::aarch64_neon_st1x4: {
16938 Info.opc = ISD::INTRINSIC_VOID;
16939 unsigned NumElts = 0;
16940 for (const Value *Arg : I.args()) {
16941 Type *ArgTy = Arg->getType();
16942 if (!ArgTy->isVectorTy())
16943 break;
16944 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16945 }
16946 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16947 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16948 Info.offset = 0;
16949 Info.align.reset();
16950 // volatile stores with NEON intrinsics not supported
16951 Info.flags = MachineMemOperand::MOStore;
16952 return true;
16953 }
16954 case Intrinsic::aarch64_neon_st2lane:
16955 case Intrinsic::aarch64_neon_st3lane:
16956 case Intrinsic::aarch64_neon_st4lane: {
16957 Info.opc = ISD::INTRINSIC_VOID;
16958 unsigned NumElts = 0;
16959 // All of the vector arguments have the same type.
16960 Type *VecTy = I.getArgOperand(0)->getType();
16961 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16962
16963 for (const Value *Arg : I.args()) {
16964 Type *ArgTy = Arg->getType();
16965 if (!ArgTy->isVectorTy())
16966 break;
16967 NumElts += 1;
16968 }
16969
16970 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16971 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16972 Info.offset = 0;
16973 Info.align.reset();
16974 // volatile stores with NEON intrinsics not supported
16975 Info.flags = MachineMemOperand::MOStore;
16976 return true;
16977 }
16978 case Intrinsic::aarch64_ldaxr:
16979 case Intrinsic::aarch64_ldxr: {
16980 Type *ValTy = I.getParamElementType(0);
16981 Info.opc = ISD::INTRINSIC_W_CHAIN;
16982 Info.memVT = MVT::getVT(ValTy);
16983 Info.ptrVal = I.getArgOperand(0);
16984 Info.offset = 0;
16985 Info.align = DL.getABITypeAlign(ValTy);
16987 return true;
16988 }
16989 case Intrinsic::aarch64_stlxr:
16990 case Intrinsic::aarch64_stxr: {
16991 Type *ValTy = I.getParamElementType(1);
16992 Info.opc = ISD::INTRINSIC_W_CHAIN;
16993 Info.memVT = MVT::getVT(ValTy);
16994 Info.ptrVal = I.getArgOperand(1);
16995 Info.offset = 0;
16996 Info.align = DL.getABITypeAlign(ValTy);
16998 return true;
16999 }
17000 case Intrinsic::aarch64_ldaxp:
17001 case Intrinsic::aarch64_ldxp:
17002 Info.opc = ISD::INTRINSIC_W_CHAIN;
17003 Info.memVT = MVT::i128;
17004 Info.ptrVal = I.getArgOperand(0);
17005 Info.offset = 0;
17006 Info.align = Align(16);
17008 return true;
17009 case Intrinsic::aarch64_stlxp:
17010 case Intrinsic::aarch64_stxp:
17011 Info.opc = ISD::INTRINSIC_W_CHAIN;
17012 Info.memVT = MVT::i128;
17013 Info.ptrVal = I.getArgOperand(2);
17014 Info.offset = 0;
17015 Info.align = Align(16);
17017 return true;
17018 case Intrinsic::aarch64_sve_ldnt1: {
17019 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17020 Info.opc = ISD::INTRINSIC_W_CHAIN;
17021 Info.memVT = MVT::getVT(I.getType());
17022 Info.ptrVal = I.getArgOperand(1);
17023 Info.offset = 0;
17024 Info.align = DL.getABITypeAlign(ElTy);
17026 return true;
17027 }
17028 case Intrinsic::aarch64_sve_stnt1: {
17029 Type *ElTy =
17030 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17031 Info.opc = ISD::INTRINSIC_W_CHAIN;
17032 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17033 Info.ptrVal = I.getArgOperand(2);
17034 Info.offset = 0;
17035 Info.align = DL.getABITypeAlign(ElTy);
17037 return true;
17038 }
17039 case Intrinsic::aarch64_mops_memset_tag: {
17040 Value *Dst = I.getArgOperand(0);
17041 Value *Val = I.getArgOperand(1);
17042 Info.opc = ISD::INTRINSIC_W_CHAIN;
17043 Info.memVT = MVT::getVT(Val->getType());
17044 Info.ptrVal = Dst;
17045 Info.offset = 0;
17046 Info.align = I.getParamAlign(0).valueOrOne();
17047 Info.flags = MachineMemOperand::MOStore;
17048 // The size of the memory being operated on is unknown at this point
17049 Info.size = MemoryLocation::UnknownSize;
17050 return true;
17051 }
17052 default:
17053 break;
17054 }
17055
17056 return false;
17057}
17058
17059 bool AArch64TargetLowering::shouldReduceLoadWidth(
17060 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17061 std::optional<unsigned> ByteOffset) const {
17062 // TODO: This may be worth removing. Check regression tests for diffs.
17063 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17064 ByteOffset))
17065 return false;
17066
17067 // If we're reducing the load width in order to avoid having to use an extra
17068 // instruction to do extension then it's probably a good idea.
17069 if (ExtTy != ISD::NON_EXTLOAD)
17070 return true;
17071 // Don't reduce load width if it would prevent us from combining a shift into
17072 // the offset.
17073 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17074 assert(Mem);
17075 const SDValue &Base = Mem->getBasePtr();
17076 if (Base.getOpcode() == ISD::ADD &&
17077 Base.getOperand(1).getOpcode() == ISD::SHL &&
17078 Base.getOperand(1).hasOneUse() &&
17079 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17080 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17081 if (Mem->getMemoryVT().isScalableVector())
17082 return false;
17083 // The shift can be combined if it matches the size of the value being
17084 // loaded (and so reducing the width would make it not match).
17085 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17086 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17087 if (ShiftAmount == Log2_32(LoadBytes))
17088 return false;
17089 }
17090 // We have no reason to disallow reducing the load width, so allow it.
17091 return true;
17092}
17093
17094// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17096 EVT VT = Extend.getValueType();
17097 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17098 SDValue Extract = Extend.getOperand(0);
17099 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17100 Extract = Extract.getOperand(0);
17101 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17102 EVT VecVT = Extract.getOperand(0).getValueType();
17103 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17104 return false;
17105 }
17106 }
17107 return true;
17108}
17109
17110 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
17111 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
17112 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17113 return false;
17114 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17115 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17116 return NumBits1 > NumBits2;
17117}
17118 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
17119 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17120 return false;
17121 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17122 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17123 return NumBits1 > NumBits2;
17124}
17125
17126/// Check if it is profitable to hoist instruction in then/else to if.
17127 /// It is not profitable if I and its user can form an FMA instruction,
17128 /// because we prefer FMSUB/FMADD.
17130 if (I->getOpcode() != Instruction::FMul)
17131 return true;
17132
17133 if (!I->hasOneUse())
17134 return true;
17135
17136 Instruction *User = I->user_back();
17137
17138 if (!(User->getOpcode() == Instruction::FSub ||
17139 User->getOpcode() == Instruction::FAdd))
17140 return true;
17141
17142 const TargetOptions &Options = getTargetMachine().Options;
17143 const Function *F = I->getFunction();
17144 const DataLayout &DL = F->getDataLayout();
17145 Type *Ty = User->getOperand(0)->getType();
17146
17147 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17148 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17149 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17150 I->getFastMathFlags().allowContract()));
17151}
17152
17153// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17154// 64-bit GPR.
17155 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17156 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17157 return false;
17158 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17159 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17160 return NumBits1 == 32 && NumBits2 == 64;
17161}
17162 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17163 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17164 return false;
17165 unsigned NumBits1 = VT1.getSizeInBits();
17166 unsigned NumBits2 = VT2.getSizeInBits();
17167 return NumBits1 == 32 && NumBits2 == 64;
17168}
17169
17170 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17171 EVT VT1 = Val.getValueType();
17172 if (isZExtFree(VT1, VT2)) {
17173 return true;
17174 }
17175
17176 if (Val.getOpcode() != ISD::LOAD)
17177 return false;
17178
17179 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17180 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17181 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17182 VT1.getSizeInBits() <= 32);
17183}
17184
17185bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17186 if (isa<FPExtInst>(Ext))
17187 return false;
17188
17189 // Vector types are not free.
17190 if (Ext->getType()->isVectorTy())
17191 return false;
17192
17193 for (const Use &U : Ext->uses()) {
17194 // The extension is free if we can fold it with a left shift in an
17195 // addressing mode or an arithmetic operation: add, sub, and cmp.
17196
17197 // Is there a shift?
17198 const Instruction *Instr = cast<Instruction>(U.getUser());
17199
17200 // Is this a constant shift?
17201 switch (Instr->getOpcode()) {
17202 case Instruction::Shl:
17203 if (!isa<ConstantInt>(Instr->getOperand(1)))
17204 return false;
17205 break;
17206 case Instruction::GetElementPtr: {
17207 gep_type_iterator GTI = gep_type_begin(Instr);
17208 auto &DL = Ext->getDataLayout();
17209 std::advance(GTI, U.getOperandNo()-1);
17210 Type *IdxTy = GTI.getIndexedType();
17211 // This extension will end up with a shift because of the scaling factor.
17212 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17213 // Get the shift amount based on the scaling factor:
17214 // log2(sizeof(IdxTy)) - log2(8).
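      // For example, an i32 index type has a 32-bit store size, giving
      // countr_zero(32) - 3 == 2, i.e. a scale of 4 bytes.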
17215 if (IdxTy->isScalableTy())
17216 return false;
17217 uint64_t ShiftAmt =
17218 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17219 3;
17220 // Is the constant foldable in the shift of the addressing mode?
17221 // I.e., shift amount is between 1 and 4 inclusive.
17222 if (ShiftAmt == 0 || ShiftAmt > 4)
17223 return false;
17224 break;
17225 }
17226 case Instruction::Trunc:
17227 // Check if this is a noop.
17228 // trunc(sext ty1 to ty2) to ty1.
17229 if (Instr->getType() == Ext->getOperand(0)->getType())
17230 continue;
17231 [[fallthrough]];
17232 default:
17233 return false;
17234 }
17235
17236 // At this point we can use the bfm family, so this extension is free
17237 // for that use.
17238 }
17239 return true;
17240}
17241
17242static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17243 unsigned NumElts, bool IsLittleEndian,
17244 SmallVectorImpl<int> &Mask) {
17245 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17246 return false;
17247
17248 assert(DstWidth % SrcWidth == 0 &&
17249 "TBL lowering is not supported for a conversion instruction with this "
17250 "source and destination element type.");
17251
17252 unsigned Factor = DstWidth / SrcWidth;
17253 unsigned MaskLen = NumElts * Factor;
17254
17255 Mask.clear();
17256 Mask.resize(MaskLen, NumElts);
17257
17258 unsigned SrcIndex = 0;
17259 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17260 Mask[I] = SrcIndex++;
17261
17262 return true;
17263}
17264
17266 FixedVectorType *ZExtTy,
17267 FixedVectorType *DstTy,
17268 bool IsLittleEndian) {
17269 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17270 unsigned NumElts = SrcTy->getNumElements();
17271 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17272 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17273
17274 SmallVector<int> Mask;
17275 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17276 return nullptr;
17277
17278 auto *FirstEltZero = Builder.CreateInsertElement(
17279 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17280 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17281 Result = Builder.CreateBitCast(Result, DstTy);
17282 if (DstTy != ZExtTy)
17283 Result = Builder.CreateZExt(Result, ZExtTy);
17284 return Result;
17285}
17286
17288 FixedVectorType *DstTy,
17289 bool IsLittleEndian) {
17290 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17291 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17292 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17293
17294 SmallVector<int> Mask;
17295 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17296 !IsLittleEndian, Mask))
17297 return nullptr;
17298
17299 auto *FirstEltZero = Builder.CreateInsertElement(
17300 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17301
17302 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17303}
17304
17305static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17306 IRBuilder<> Builder(TI);
17308 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17309 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17310 auto *DstTy = cast<FixedVectorType>(TI->getType());
17311 assert(SrcTy->getElementType()->isIntegerTy() &&
17312 "Non-integer type source vector element is not supported");
17313 assert(DstTy->getElementType()->isIntegerTy(8) &&
17314 "Unsupported destination vector element type");
17315 unsigned SrcElemTySz =
17316 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17317 unsigned DstElemTySz =
17318 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17319 assert((SrcElemTySz % DstElemTySz == 0) &&
17320 "Cannot lower truncate to tbl instructions for a source element size "
17321 "that is not divisible by the destination element size");
17322 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17323 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17324 "Unsupported source vector element type size");
17325 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17326
17327 // Create a mask to choose every nth byte from the source vector table of
17328 // bytes to create the truncated destination vector, where 'n' is the truncate
17329 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17330 // bytes 0, 8, 16, .., (Y-1)*8 for the little-endian format
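  // Concretely, truncating <8 x i64> to <8 x i8> (TruncFactor 8) selects bytes
  // 0, 8, 16, .., 56, with 255 marking the unused upper mask entries.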
17332 for (int Itr = 0; Itr < 16; Itr++) {
17333 if (Itr < NumElements)
17334 MaskConst.push_back(Builder.getInt8(
17335 IsLittleEndian ? Itr * TruncFactor
17336 : Itr * TruncFactor + (TruncFactor - 1)));
17337 else
17338 MaskConst.push_back(Builder.getInt8(255));
17339 }
17340
17341 int MaxTblSz = 128 * 4;
17342 int MaxSrcSz = SrcElemTySz * NumElements;
17343 int ElemsPerTbl =
17344 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17345 assert(ElemsPerTbl <= 16 &&
17346 "Maximum elements selected using TBL instruction cannot exceed 16!");
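  // For example, a <16 x i64> source spans 1024 bits, exceeding the 512-bit
  // table limit, so ElemsPerTbl is 512 / 64 = 8 and two TBLs are emitted.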
17347
17348 int ShuffleCount = 128 / SrcElemTySz;
17349 SmallVector<int> ShuffleLanes;
17350 for (int i = 0; i < ShuffleCount; ++i)
17351 ShuffleLanes.push_back(i);
17352
17353 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17354 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17355 // call TBL & save the result in a vector of TBL results for combining later.
17357 while (ShuffleLanes.back() < NumElements) {
17358 Parts.push_back(Builder.CreateBitCast(
17359 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17360
17361 if (Parts.size() == 4) {
17362 Parts.push_back(ConstantVector::get(MaskConst));
17363 Results.push_back(
17364 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17365 Parts.clear();
17366 }
17367
17368 for (int i = 0; i < ShuffleCount; ++i)
17369 ShuffleLanes[i] += ShuffleCount;
17370 }
17371
17372 assert((Parts.empty() || Results.empty()) &&
17373 "Lowering trunc for vectors requiring different TBL instructions is "
17374 "not supported!");
17375 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17376 // registers
17377 if (!Parts.empty()) {
17378 Intrinsic::ID TblID;
17379 switch (Parts.size()) {
17380 case 1:
17381 TblID = Intrinsic::aarch64_neon_tbl1;
17382 break;
17383 case 2:
17384 TblID = Intrinsic::aarch64_neon_tbl2;
17385 break;
17386 case 3:
17387 TblID = Intrinsic::aarch64_neon_tbl3;
17388 break;
17389 }
17390
17391 Parts.push_back(ConstantVector::get(MaskConst));
17392 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17393 }
17394
17395 // Extract the destination vector from TBL result(s) after combining them
17396 // where applicable. Currently, at most two TBLs are supported.
17397 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17398 "more than 2 tbl instructions!");
17399 Value *FinalResult = Results[0];
17400 if (Results.size() == 1) {
17401 if (ElemsPerTbl < 16) {
17402 SmallVector<int> FinalMask(ElemsPerTbl);
17403 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17404 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17405 }
17406 } else {
17407 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17408 if (ElemsPerTbl < 16) {
17409 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17410 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17411 } else {
17412 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17413 }
17414 FinalResult =
17415 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17416 }
17417
17418 TI->replaceAllUsesWith(FinalResult);
17419 TI->eraseFromParent();
17420}
17421
17423 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17424 // shuffle_vector instructions are serialized when targeting SVE,
17425 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17426 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17427 return false;
17428
17429 // Try to optimize conversions using tbl. This requires materializing constant
17430 // index vectors, which can increase code size and add loads. Skip the
17431 // transform unless the conversion is in a loop block guaranteed to execute
17432 // and we are not optimizing for size.
17433 Function *F = I->getParent()->getParent();
17434 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17435 return false;
17436
17437 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17438 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17439 if (!SrcTy || !DstTy)
17440 return false;
17441
17442 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17443 // lowered to tbl instructions to insert the original i8 elements
17444 // into i8x lanes. This is enabled for cases where it is beneficial.
17445 auto *ZExt = dyn_cast<ZExtInst>(I);
17446 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17447 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17448 if (DstWidth % 8 != 0)
17449 return false;
17450
17451 auto *TruncDstType =
17453 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17454 // the remaining ZExt folded into the user, don't use tbl lowering.
17455 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17456 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17459 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17460 return false;
17461
17462 DstTy = TruncDstType;
17463 }
17464
17465 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17466 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17467 // most one extra extend step is needed and using tbl is not profitable.
17468 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17469 // udot instruction.
17470 if (SrcWidth * 4 <= DstWidth) {
17471 if (all_of(I->users(), [&](auto *U) {
17472 auto *SingleUser = cast<Instruction>(&*U);
17473 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17474 return true;
17475 if (match(SingleUser,
17476 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17477 m_Value(), m_Specific(I))))
17478 return true;
17479 return false;
17480 }))
17481 return false;
17482 }
17483
17484 if (DstTy->getScalarSizeInBits() >= 64)
17485 return false;
17486
17487 IRBuilder<> Builder(ZExt);
17489 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17490 DstTy, Subtarget->isLittleEndian());
17491 if (!Result)
17492 return false;
17493 ZExt->replaceAllUsesWith(Result);
17494 ZExt->eraseFromParent();
17495 return true;
17496 }
17497
17498 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17499 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17500 DstTy->getElementType()->isFloatTy()) ||
17501 (SrcTy->getElementType()->isIntegerTy(16) &&
17502 DstTy->getElementType()->isDoubleTy()))) {
17503 IRBuilder<> Builder(I);
17505 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17506 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17507 assert(ZExt && "Cannot fail for the i8 to float conversion");
17508 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17509 I->replaceAllUsesWith(UI);
17510 I->eraseFromParent();
17511 return true;
17512 }
17513
17514 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17515 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17516 DstTy->getElementType()->isFloatTy()) {
17517 IRBuilder<> Builder(I);
17518 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17520 Subtarget->isLittleEndian());
17521 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17522 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17523 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17524 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17525 I->replaceAllUsesWith(SI);
17526 I->eraseFromParent();
17527 return true;
17528 }
17529
17530 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17531 // followed by a truncate lowered to using tbl.4.
17532 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17533 if (FPToUI &&
17534 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17535 SrcTy->getElementType()->isFloatTy() &&
17536 DstTy->getElementType()->isIntegerTy(8)) {
17537 IRBuilder<> Builder(I);
17538 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17539 VectorType::getInteger(SrcTy));
17540 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17541 I->replaceAllUsesWith(TruncI);
17542 I->eraseFromParent();
17543 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17544 return true;
17545 }
17546
17547 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17548 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17549 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17550 // registers
17551 auto *TI = dyn_cast<TruncInst>(I);
17552 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17553 ((SrcTy->getElementType()->isIntegerTy(32) ||
17554 SrcTy->getElementType()->isIntegerTy(64)) &&
17555 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17556 createTblForTrunc(TI, Subtarget->isLittleEndian());
17557 return true;
17558 }
17559
17560 return false;
17561}
17562
17564 Align &RequiredAlignment) const {
17565 if (!LoadedType.isSimple() ||
17566 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17567 return false;
17568 // Cyclone supports unaligned accesses.
17569 RequiredAlignment = Align(1);
17570 unsigned NumBits = LoadedType.getSizeInBits();
17571 return NumBits == 32 || NumBits == 64;
17572}
17573
17574/// A helper function for determining the number of interleaved accesses we
17575/// will generate when lowering accesses of the given type.
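/// For example, a fixed-length <8 x i32> (256 bits) lowered into 128-bit
/// vectors needs (8 * 32 + 127) / 128 = 2 interleaved accesses.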
17577 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17578 unsigned VecSize = 128;
17579 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17580 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17581 if (UseScalable && isa<FixedVectorType>(VecTy))
17582 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17583 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17584}
17585
17588 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17589 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17590 return MOStridedAccess;
17592}
17593
17595 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17596 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17597 auto EC = VecTy->getElementCount();
17598 unsigned MinElts = EC.getKnownMinValue();
17599
17600 UseScalable = false;
17601
17602 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17603 (!Subtarget->useSVEForFixedLengthVectors() ||
17605 return false;
17606
17607 if (isa<ScalableVectorType>(VecTy) &&
17608 !Subtarget->isSVEorStreamingSVEAvailable())
17609 return false;
17610
17611 // Ensure the number of vector elements is greater than 1.
17612 if (MinElts < 2)
17613 return false;
17614
17615 // Ensure the element type is legal.
17616 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17617 return false;
17618
17619 if (EC.isScalable()) {
17620 UseScalable = true;
17621 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17622 }
17623
17624 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17625 if (Subtarget->useSVEForFixedLengthVectors()) {
17626 unsigned MinSVEVectorSize =
17627 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17628 if (VecSize % MinSVEVectorSize == 0 ||
17629 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17630 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17631 UseScalable = true;
17632 return true;
17633 }
17634 }
17635
17636 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17637 // 128 will be split into multiple interleaved accesses.
17638 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17639}
17640
17642 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17643 return ScalableVectorType::get(VTy->getElementType(), 2);
17644
17645 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17646 return ScalableVectorType::get(VTy->getElementType(), 4);
17647
17648 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17649 return ScalableVectorType::get(VTy->getElementType(), 8);
17650
17651 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17652 return ScalableVectorType::get(VTy->getElementType(), 8);
17653
17654 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17655 return ScalableVectorType::get(VTy->getElementType(), 2);
17656
17657 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17658 return ScalableVectorType::get(VTy->getElementType(), 4);
17659
17660 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17661 return ScalableVectorType::get(VTy->getElementType(), 8);
17662
17663 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17664 return ScalableVectorType::get(VTy->getElementType(), 16);
17665
17666 llvm_unreachable("Cannot handle input vector type");
17667}
17668
17669static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17670 bool Scalable, Type *LDVTy,
17671 Type *PtrTy) {
17672 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17673 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17674 Intrinsic::aarch64_sve_ld3_sret,
17675 Intrinsic::aarch64_sve_ld4_sret};
17676 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17677 Intrinsic::aarch64_neon_ld3,
17678 Intrinsic::aarch64_neon_ld4};
17679 if (Scalable)
17680 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17681
17682 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17683 {LDVTy, PtrTy});
17684}
17685
17686static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17687 bool Scalable, Type *STVTy,
17688 Type *PtrTy) {
17689 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17690 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17691 Intrinsic::aarch64_sve_st3,
17692 Intrinsic::aarch64_sve_st4};
17693 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17694 Intrinsic::aarch64_neon_st3,
17695 Intrinsic::aarch64_neon_st4};
17696 if (Scalable)
17697 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17698
17699 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17700 {STVTy, PtrTy});
17701}
17702
17703/// Lower an interleaved load into a ldN intrinsic.
17704///
17705/// E.g. Lower an interleaved load (Factor = 2):
17706/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17707/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17708/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17709///
17710/// Into:
17711/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17712/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17713/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17715 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17716 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17717 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17718 "Invalid interleave factor");
17719 assert(!Shuffles.empty() && "Empty shufflevector input");
17720 assert(Shuffles.size() == Indices.size() &&
17721 "Unmatched number of shufflevectors and indices");
17722
17723 auto *LI = dyn_cast<LoadInst>(Load);
17724 if (!LI)
17725 return false;
17726 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17727
17728 const DataLayout &DL = LI->getDataLayout();
17729
17730 VectorType *VTy = Shuffles[0]->getType();
17731
17732 // Skip if we do not have NEON and skip illegal vector types. We can
17733 // "legalize" wide vector types into multiple interleaved accesses as long as
17734 // the vector types are divisible by 128.
17735 bool UseScalable;
17736 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17737 return false;
17738
17739 // Check if the interleave is a zext(shuffle), that can be better optimized
17740 // into shift / and masks. For the moment we do this just for uitofp (not
17741 // zext) to avoid issues with widening instructions.
17742 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17743 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17744 SI->getType()->getScalarSizeInBits() * 4 ==
17745 SI->user_back()->getType()->getScalarSizeInBits();
17746 }))
17747 return false;
17748
17749 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17750
17751 auto *FVTy = cast<FixedVectorType>(VTy);
17752
17753 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17754 // load integer vectors first and then convert to pointer vectors.
17755 Type *EltTy = FVTy->getElementType();
17756 if (EltTy->isPointerTy())
17757 FVTy =
17758 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17759
17760 // If we're going to generate more than one load, reset the sub-vector type
17761 // to something legal.
17762 FVTy = FixedVectorType::get(FVTy->getElementType(),
17763 FVTy->getNumElements() / NumLoads);
17764
17765 auto *LDVTy =
17766 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17767
17768 IRBuilder<> Builder(LI);
17769
17770 // The base address of the load.
17771 Value *BaseAddr = LI->getPointerOperand();
17772
17773 Type *PtrTy = LI->getPointerOperandType();
17774 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17775 LDVTy->getElementCount());
17776
17777 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17778 UseScalable, LDVTy, PtrTy);
17779
17780 // Holds sub-vectors extracted from the load intrinsic return values. The
17781 // sub-vectors are associated with the shufflevector instructions they will
17782 // replace.
17784
17785 Value *PTrue = nullptr;
17786 if (UseScalable) {
17787 std::optional<unsigned> PgPattern =
17788 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17789 if (Subtarget->getMinSVEVectorSizeInBits() ==
17790 Subtarget->getMaxSVEVectorSizeInBits() &&
17791 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17792 PgPattern = AArch64SVEPredPattern::all;
17793
17794 auto *PTruePat =
17795 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17796 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17797 {PTruePat});
17798 }
17799
17800 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17801
17802 // If we're generating more than one load, compute the base address of
17803 // subsequent loads as an offset from the previous.
17804 if (LoadCount > 0)
17805 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17806 FVTy->getNumElements() * Factor);
17807
17808 CallInst *LdN;
17809 if (UseScalable)
17810 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17811 else
17812 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17813
17814 // Extract and store the sub-vectors returned by the load intrinsic.
17815 for (unsigned i = 0; i < Shuffles.size(); i++) {
17816 ShuffleVectorInst *SVI = Shuffles[i];
17817 unsigned Index = Indices[i];
17818
17819 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17820
17821 if (UseScalable)
17822 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17823
17824 // Convert the integer vector to pointer vector if the element is pointer.
17825 if (EltTy->isPointerTy())
17826 SubVec = Builder.CreateIntToPtr(
17828 FVTy->getNumElements()));
17829
17830 SubVecs[SVI].push_back(SubVec);
17831 }
17832 }
17833
17834 // Replace uses of the shufflevector instructions with the sub-vectors
17835 // returned by the load intrinsic. If a shufflevector instruction is
17836 // associated with more than one sub-vector, those sub-vectors will be
17837 // concatenated into a single wide vector.
17838 for (ShuffleVectorInst *SVI : Shuffles) {
17839 auto &SubVec = SubVecs[SVI];
17840 auto *WideVec =
17841 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17842 SVI->replaceAllUsesWith(WideVec);
17843 }
17844
17845 return true;
17846}
17847
17848template <typename Iter>
17849bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17850 int MaxLookupDist = 20;
17851 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17852 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17853 const Value *PtrA1 =
17854 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17855
17856 while (++It != End) {
17857 if (It->isDebugOrPseudoInst())
17858 continue;
17859 if (MaxLookupDist-- == 0)
17860 break;
17861 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17862 const Value *PtrB1 =
17863 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17864 DL, OffsetB);
17865 if (PtrA1 == PtrB1 &&
17866 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17867 .abs() == 16)
17868 return true;
17869 }
17870 }
17871
17872 return false;
17873}
17874
17875/// Lower an interleaved store into a stN intrinsic.
17876///
17877/// E.g. Lower an interleaved store (Factor = 3):
17878/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17879/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17880/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17881///
17882/// Into:
17883/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17884/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17885/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17886/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17887///
17888/// Note that the new shufflevectors will be removed and we'll only generate one
17889/// st3 instruction in CodeGen.
17890///
17891/// Example for a more general valid mask (Factor 3). Lower:
17892/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17893/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17894/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17895///
17896/// Into:
17897/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17898/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17899/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17900/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17902 Value *LaneMask,
17903 ShuffleVectorInst *SVI,
17904 unsigned Factor,
17905 const APInt &GapMask) const {
17906
17907 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17908 "Invalid interleave factor");
17909 auto *SI = dyn_cast<StoreInst>(Store);
17910 if (!SI)
17911 return false;
17912 assert(!LaneMask && GapMask.popcount() == Factor &&
17913 "Unexpected mask on store");
17914
17915 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17916 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17917
17918 unsigned LaneLen = VecTy->getNumElements() / Factor;
17919 Type *EltTy = VecTy->getElementType();
17920 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17921
17922 const DataLayout &DL = SI->getDataLayout();
17923 bool UseScalable;
17924
17925 // Skip if we do not have NEON and skip illegal vector types. We can
17926 // "legalize" wide vector types into multiple interleaved accesses as long as
17927 // the vector types are divisible by 128.
17928 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17929 return false;
17930
17931 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17932
17933 Value *Op0 = SVI->getOperand(0);
17934 Value *Op1 = SVI->getOperand(1);
17935 IRBuilder<> Builder(SI);
17936
17937 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17938 // vectors to integer vectors.
17939 if (EltTy->isPointerTy()) {
17940 Type *IntTy = DL.getIntPtrType(EltTy);
17941 unsigned NumOpElts =
17942 cast<FixedVectorType>(Op0->getType())->getNumElements();
17943
17944 // Convert to the corresponding integer vector.
17945 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17946 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17947 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17948
17949 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17950 }
17951
17952 // If we're going to generate more than one store, reset the lane length
17953 // and sub-vector type to something legal.
17954 LaneLen /= NumStores;
17955 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17956
17957 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17958 : SubVecTy;
17959
17960 // The base address of the store.
17961 Value *BaseAddr = SI->getPointerOperand();
17962
17963 auto Mask = SVI->getShuffleMask();
17964
17965 // Sanity check: bail out if none of the mask indices are in range.
17966 // If the mask is `poison`, `Mask` may be a vector of -1s.
17967 // If all of them are `poison`, an out-of-bounds read would happen later.
17968 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17969 return false;
17970 }
17971 // A 64-bit st2 which does not start at element 0 will involve adding extra
17972 // ext elements, making the st2 unprofitable. If there is a nearby store
17973 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
17974 // zip;stp pair, which has higher throughput.
17975 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17976 (Mask[0] != 0 ||
17977 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17978 DL) ||
17979 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17980 BaseAddr, DL)))
17981 return false;
17982
17983 Type *PtrTy = SI->getPointerOperandType();
17984 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17985 STVTy->getElementCount());
17986
17987 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17988 UseScalable, STVTy, PtrTy);
17989
17990 Value *PTrue = nullptr;
17991 if (UseScalable) {
17992 std::optional<unsigned> PgPattern =
17993 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17994 if (Subtarget->getMinSVEVectorSizeInBits() ==
17995 Subtarget->getMaxSVEVectorSizeInBits() &&
17996 Subtarget->getMinSVEVectorSizeInBits() ==
17997 DL.getTypeSizeInBits(SubVecTy))
17998 PgPattern = AArch64SVEPredPattern::all;
17999
18000 auto *PTruePat =
18001 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18002 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18003 {PTruePat});
18004 }
18005
18006 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18007
18009
18010 // Split the shufflevector operands into sub vectors for the new stN call.
18011 for (unsigned i = 0; i < Factor; i++) {
18012 Value *Shuffle;
18013 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18014 if (Mask[IdxI] >= 0) {
18015 Shuffle = Builder.CreateShuffleVector(
18016 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18017 } else {
18018 unsigned StartMask = 0;
18019 for (unsigned j = 1; j < LaneLen; j++) {
18020 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18021 if (Mask[IdxJ] >= 0) {
18022 StartMask = Mask[IdxJ] - j;
18023 break;
18024 }
18025 }
18026 // Note: Filling undef gaps with random elements is ok, since
18027 // those elements were being written anyway (with undefs).
18028 // In the case of all undefs we default to using elements starting from 0.
18029 // Note: StartMask cannot be negative, it's checked in
18030 // isReInterleaveMask
18031 Shuffle = Builder.CreateShuffleVector(
18032 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18033 }
18034
18035 if (UseScalable)
18036 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18037 Shuffle, uint64_t(0));
18038
18039 Ops.push_back(Shuffle);
18040 }
18041
18042 if (UseScalable)
18043 Ops.push_back(PTrue);
18044
18045 // If we're generating more than one store, we compute the base address of
18046 // subsequent stores as an offset from the previous.
18047 if (StoreCount > 0)
18048 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18049 BaseAddr, LaneLen * Factor);
18050
18051 Ops.push_back(BaseAddr);
18052 Builder.CreateCall(StNFunc, Ops);
18053 }
18054 return true;
18055}
18056
18058 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18059 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18060 if (Factor != 2 && Factor != 4) {
18061 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
18062 return false;
18063 }
18064 auto *LI = dyn_cast<LoadInst>(Load);
18065 if (!LI)
18066 return false;
18067 assert(!Mask && "Unexpected mask on a load\n");
18068
18070
18071 const DataLayout &DL = LI->getModule()->getDataLayout();
18072 bool UseScalable;
18073 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18074 return false;
18075
18076 // TODO: Add support for using SVE instructions with fixed types later, using
18077 // the code from lowerInterleavedLoad to obtain the correct container type.
18078 if (UseScalable && !VTy->isScalableTy())
18079 return false;
18080
18081 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18082 VectorType *LdTy =
18084 VTy->getElementCount().divideCoefficientBy(NumLoads));
18085
18086 Type *PtrTy = LI->getPointerOperandType();
18087 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18088 UseScalable, LdTy, PtrTy);
18089
18090 IRBuilder<> Builder(LI);
18091 Value *Pred = nullptr;
18092 if (UseScalable)
18093 Pred =
18094 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18095
18096 Value *BaseAddr = LI->getPointerOperand();
18097 Value *Result = nullptr;
18098 if (NumLoads > 1) {
18099 // Create multiple legal small ldN.
18100 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18101 for (unsigned I = 0; I < NumLoads; ++I) {
18102 Value *Offset = Builder.getInt64(I * Factor);
18103
18104 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18105 Value *LdN = nullptr;
18106 if (UseScalable)
18107 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18108 else
18109 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18110 Value *Idx =
18111 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18112 for (unsigned J = 0; J < Factor; ++J) {
18113 ExtractedLdValues[J] = Builder.CreateInsertVector(
18114 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18115 }
18116 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18117 }
18118
18119 // Merge the values from different factors.
18120 Result = PoisonValue::get(DI->getType());
18121 for (unsigned J = 0; J < Factor; ++J)
18122 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18123 } else {
18124 if (UseScalable)
18125 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18126 else
18127 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18128 }
18129
18130 // Replace the outputs of the deinterleave intrinsic with the ld2/ld4 results.
18131 DI->replaceAllUsesWith(Result);
18132 return true;
18133}
18134
18136 Instruction *Store, Value *Mask,
18137 ArrayRef<Value *> InterleavedValues) const {
18138 unsigned Factor = InterleavedValues.size();
18139 if (Factor != 2 && Factor != 4) {
18140 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
18141 return false;
18142 }
18144 if (!SI)
18145 return false;
18146 assert(!Mask && "Unexpected mask on plain store");
18147
18148 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18149 const DataLayout &DL = SI->getModule()->getDataLayout();
18150
18151 bool UseScalable;
18152 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18153 return false;
18154
18155 // TODO: Add support for using SVE instructions with fixed types later, using
18156 // the code from lowerInterleavedStore to obtain the correct container type.
18157 if (UseScalable && !VTy->isScalableTy())
18158 return false;
18159
18160 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18161
18162 VectorType *StTy =
18164 VTy->getElementCount().divideCoefficientBy(NumStores));
18165
18166 Type *PtrTy = SI->getPointerOperandType();
18167 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18168 UseScalable, StTy, PtrTy);
18169
18170 IRBuilder<> Builder(SI);
18171
18172 Value *BaseAddr = SI->getPointerOperand();
18173 Value *Pred = nullptr;
18174
18175 if (UseScalable)
18176 Pred =
18177 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18178
18179 auto ExtractedValues = InterleavedValues;
18180 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18181 if (UseScalable)
18182 StoreOperands.push_back(Pred);
18183 StoreOperands.push_back(BaseAddr);
18184 for (unsigned I = 0; I < NumStores; ++I) {
18185 Value *Address = BaseAddr;
18186 if (NumStores > 1) {
18187 Value *Offset = Builder.getInt64(I * Factor);
18188 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18189 Value *Idx =
18190 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18191 for (unsigned J = 0; J < Factor; J++) {
18192 StoreOperands[J] =
18193 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18194 }
18195 // update the address
18196 StoreOperands[StoreOperands.size() - 1] = Address;
18197 }
18198 Builder.CreateCall(StNFunc, StoreOperands);
18199 }
18200 return true;
18201}
18202
18204 LLVMContext &Context, const MemOp &Op,
18205 const AttributeList &FuncAttributes) const {
18206 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18207 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18208 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18209 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18210 // taken one instruction to materialize the v2i64 zero and one store (with
18211 // restrictive addressing mode). Just do i64 stores.
18212 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18213 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18214 if (Op.isAligned(AlignCheck))
18215 return true;
18216 unsigned Fast;
18217 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18219 Fast;
18220 };
18221
18222 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18223 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18224 return MVT::v16i8;
18225 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18226 return MVT::f128;
18227 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18228 return MVT::i64;
18229 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18230 return MVT::i32;
18231 return MVT::Other;
18232}
18233
18235 const MemOp &Op, const AttributeList &FuncAttributes) const {
18236 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18237 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18238 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18239 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18240 // taken one instruction to materialize the v2i64 zero and one store (with
18241 // restrictive addressing mode). Just do i64 stores.
18242 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18243 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18244 if (Op.isAligned(AlignCheck))
18245 return true;
18246 unsigned Fast;
18247 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18249 Fast;
18250 };
18251
18252 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18253 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18254 return LLT::fixed_vector(2, 64);
18255 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18256 return LLT::scalar(128);
18257 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18258 return LLT::scalar(64);
18259 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18260 return LLT::scalar(32);
18261 return LLT();
18262}
18263
18264// 12-bit optionally shifted immediates are legal for adds.
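// That is, unsigned values 0..4095, optionally shifted left by 12 bits
// (4096, 8192, .., up to 0xfff000).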
18266 if (Immed == std::numeric_limits<int64_t>::min()) {
18267 return false;
18268 }
18269 // Same encoding for add/sub, just flip the sign.
18270 return isLegalArithImmed((uint64_t)std::abs(Immed));
18271}
18272
18274 // We will only emit addvl/inc* instructions for SVE2
18275 if (!Subtarget->hasSVE2())
18276 return false;
18277
18278 // addvl's immediates are in terms of the number of bytes in a register.
18279 // Since there are 16 bytes in the base supported size (128 bits), we need to
18280 // divide the immediate by that much to give us a useful immediate to
18281 // multiply by vscale. We can't have a remainder as a result of this.
18282 if (Imm % 16 == 0)
18283 return isInt<6>(Imm / 16);
18284
18285 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18286 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18287 // of addvl as a result, so only take h|w|d into account.
18288 // Dec[h|w|d] will cover subtractions.
18289 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18290 // FIXME: Can we make use of other patterns to cover other immediates?
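  // For example, Imm == 32 is 2 * 16 and can use addvl #2, while Imm == 40 is
  // not a multiple of 16 but equals 5 * 8, so inch (pattern all, mul 5) covers it.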
18291
18292 // inch|dech
18293 if (Imm % 8 == 0)
18294 return std::abs(Imm / 8) <= 16;
18295 // incw|decw
18296 if (Imm % 4 == 0)
18297 return std::abs(Imm / 4) <= 16;
18298 // incd|decd
18299 if (Imm % 2 == 0)
18300 return std::abs(Imm / 2) <= 16;
18301
18302 return false;
18303}
18304
18305// Return false to prevent folding
18306// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18307// if the folding leads to worse code.
18309 SDValue AddNode, SDValue ConstNode) const {
18310 // Let the DAGCombiner decide for vector types and large types.
18311 const EVT VT = AddNode.getValueType();
18312 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18313 return true;
18314
18315 // It is worse if c1 is a legal add immediate while c1*c2 is not,
18316 // and composing c1*c2 takes at least two instructions.
18317 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18318 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18319 const int64_t C1 = C1Node->getSExtValue();
18320 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18322 return true;
18324 // Adapt to the width of a register.
18325 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18326 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18327 if (Insn.size() > 1)
18328 return false;
18329
18330 // Default to true and let the DAGCombiner decide.
18331 return true;
18332}
18333
18334// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18335// immediates is the same as for an add or a sub.
18337 return isLegalAddImmediate(Immed);
18338}
18339
18340/// isLegalAddressingMode - Return true if the addressing mode represented
18341/// by AM is legal for this target, for a load/store of the specified type.
18343 const AddrMode &AMode, Type *Ty,
18344 unsigned AS, Instruction *I) const {
18345 // AArch64 has five basic addressing modes:
18346 // reg
18347 // reg + 9-bit signed offset
18348 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18349 // reg1 + reg2
18350 // reg + SIZE_IN_BYTES * reg
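  // For an i64 access, for example, the scaled unsigned form covers byte
  // offsets 0, 8, .., 8 * 4095 = 32760, while the signed form covers [-256, 255].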
18351
18352 // No global is ever allowed as a base.
18353 if (AMode.BaseGV)
18354 return false;
18355
18356 // No reg+reg+imm addressing.
18357 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18358 return false;
18359
18360 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18361 // `2*ScaledReg` into `BaseReg + ScaledReg`
18362 AddrMode AM = AMode;
18363 if (AM.Scale && !AM.HasBaseReg) {
18364 if (AM.Scale == 1) {
18365 AM.HasBaseReg = true;
18366 AM.Scale = 0;
18367 } else if (AM.Scale == 2) {
18368 AM.HasBaseReg = true;
18369 AM.Scale = 1;
18370 } else {
18371 return false;
18372 }
18373 }
18374
18375 // A base register is required in all addressing modes.
18376 if (!AM.HasBaseReg)
18377 return false;
18378
18379 if (Ty->isScalableTy()) {
18380 if (isa<ScalableVectorType>(Ty)) {
18381 // See if we have a foldable vscale-based offset, for vector types which
18382 // are either legal or smaller than the minimum; more work will be
18383 // required if we need to consider addressing for types which need
18384 // legalization by splitting.
18385 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18386 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18387 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18388 isPowerOf2_64(VecNumBytes))
18389 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18390
18391 uint64_t VecElemNumBytes =
18392 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18393 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18394 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18395 }
18396
18397 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18398 }
18399
18400 // No scalable offsets allowed for non-scalable types.
18401 if (AM.ScalableOffset)
18402 return false;
18403
18404 // check reg + imm case:
18405 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18406 uint64_t NumBytes = 0;
18407 if (Ty->isSized()) {
18408 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18409 NumBytes = NumBits / 8;
18410 if (!isPowerOf2_64(NumBits))
18411 NumBytes = 0;
18412 }
18413
18414 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18415 AM.Scale);
18416}
18417
18418// Check whether the two offsets belong to the same imm24 range and their high
18419// 12 bits are equal; if so, the common high part can be materialized by one add.
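// For example, offsets 0x1234 and 0x1678 share the high part 0x1000, which is
// itself a legal add immediate; the residuals 0x234 and 0x678 then fit in imm12.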
18420int64_t
18422 int64_t MaxOffset) const {
18423 int64_t HighPart = MinOffset & ~0xfffULL;
18424 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18425 // Rebase the value to an integer multiple of imm12.
18426 return HighPart;
18427 }
18428
18429 return 0;
18430}
18431
18433 // Consider splitting large offset of struct or array.
18434 return true;
18435}
18436
18438 const MachineFunction &MF, EVT VT) const {
18439 EVT ScalarVT = VT.getScalarType();
18440
18441 if (!ScalarVT.isSimple())
18442 return false;
18443
18444 switch (ScalarVT.getSimpleVT().SimpleTy) {
18445 case MVT::f16:
18446 return Subtarget->hasFullFP16();
18447 case MVT::f32:
18448 case MVT::f64:
18449 return true;
18450 case MVT::bf16:
18451 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18452 Subtarget->isNonStreamingSVEorSME2Available();
18453 default:
18454 break;
18455 }
18456
18457 return false;
18458}
18459
18461 Type *Ty) const {
18462 switch (Ty->getScalarType()->getTypeID()) {
18463 case Type::FloatTyID:
18464 case Type::DoubleTyID:
18465 return true;
18466 default:
18467 return false;
18468 }
18469}
18470
18472 EVT VT, CodeGenOptLevel OptLevel) const {
18473 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18475}
18476
18477const MCPhysReg *
18479 // LR is a callee-save register, but we must treat it as clobbered by any call
18480 // site. Hence we include LR in the scratch registers, which are in turn added
18481 // as implicit-defs for stackmaps and patchpoints.
18482 static const MCPhysReg ScratchRegs[] = {
18483 AArch64::X16, AArch64::X17, AArch64::LR, 0
18484 };
18485 return ScratchRegs;
18486}
18487
18489 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18490 return RCRegs;
18491}
18492
18493bool
18495 CombineLevel Level) const {
18496 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18497 N->getOpcode() == ISD::SRL) &&
18498 "Expected shift op");
18499
18500 SDValue ShiftLHS = N->getOperand(0);
18501 EVT VT = N->getValueType(0);
18502
18503 if (!ShiftLHS->hasOneUse())
18504 return false;
18505
18506 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18507 !ShiftLHS.getOperand(0)->hasOneUse())
18508 return false;
18509
18510 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18511 // combine it with shift 'N' to let it be lowered to UBFX except:
18512 // ((x >> C) & mask) << C.
18513 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18514 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18515 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18516 if (isMask_64(TruncMask)) {
18517 SDValue AndLHS = ShiftLHS.getOperand(0);
18518 if (AndLHS.getOpcode() == ISD::SRL) {
18519 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18520 if (N->getOpcode() == ISD::SHL)
18521 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18522 return SRLC->getZExtValue() == SHLC->getZExtValue();
18523 return false;
18524 }
18525 }
18526 }
18527 }
18528 return true;
18529}
18530
18532 const SDNode *N) const {
18533 assert(N->getOpcode() == ISD::XOR &&
18534 (N->getOperand(0).getOpcode() == ISD::SHL ||
18535 N->getOperand(0).getOpcode() == ISD::SRL) &&
18536 "Expected XOR(SHIFT) pattern");
18537
18538 // Only commute if the entire NOT mask is a hidden shifted mask.
18539 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18540 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18541 if (XorC && ShiftC) {
18542 unsigned MaskIdx, MaskLen;
18543 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18544 unsigned ShiftAmt = ShiftC->getZExtValue();
18545 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18546 if (N->getOperand(0).getOpcode() == ISD::SHL)
18547 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18548 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18549 }
18550 }
18551
18552 return false;
18553}
18554
18556 const SDNode *N, CombineLevel Level) const {
18557 assert(((N->getOpcode() == ISD::SHL &&
18558 N->getOperand(0).getOpcode() == ISD::SRL) ||
18559 (N->getOpcode() == ISD::SRL &&
18560 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18561 "Expected shift-shift mask");
18562 // Don't allow multiuse shift folding with the same shift amount.
18563 if (!N->getOperand(0)->hasOneUse())
18564 return false;
18565
18566 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18567 EVT VT = N->getValueType(0);
18568 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18569 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18570 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18571 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18572 }
18573
18574 // We do not need to fold when this shift is used in the specific load case:
18575 // (ldr x, (add x, (shl (srl x, c1) 2)))
18576 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18577 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18578 unsigned ShlAmt = C2->getZExtValue();
18579 if (auto ShouldADD = *N->user_begin();
18580 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18581 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18582 EVT MemVT = Load->getMemoryVT();
18583
18584 if (Load->getValueType(0).isScalableVector())
18585 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18586
18587 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18588 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18589 }
18590 }
18591 }
18592 }
18593
18594 return true;
18595}
18596
18598 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18599 SDValue Y) const {
18600 return VT.isScalableVector() && isTypeLegal(VT) &&
18601 SelectOpcode == ISD::VSELECT;
18602}
18603
18605 Type *Ty) const {
18606 assert(Ty->isIntegerTy());
18607
18608 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18609 if (BitSize == 0)
18610 return false;
18611
18612 int64_t Val = Imm.getSExtValue();
18613 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18614 return true;
18615
18616 if (Val < 0)
18617 Val = ~Val;
18618 if (BitSize == 32)
18619 Val &= (1LL << 32) - 1;
18620
18621 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18622 // MOVZ is free so return true for one or fewer MOVK.
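  // For example, Val == 0x12340000 has its highest set bit at position 28, so
  // Shift == 1 and the constant needs a MOVZ plus at most one MOVK.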
18623 return Shift < 3;
18624}
18625
18627 unsigned Index) const {
18629 return false;
18630
18631 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18632}
18633
18634/// Turn vector tests of the signbit in the form of:
18635/// xor (sra X, elt_size(X)-1), -1
18636/// into:
18637/// cmge X, X, #0
18639 const AArch64Subtarget *Subtarget) {
18640 EVT VT = N->getValueType(0);
18641 if (!Subtarget->hasNEON() || !VT.isVector())
18642 return SDValue();
18643
18644 // There must be a shift right algebraic before the xor, and the xor must be a
18645 // 'not' operation.
18646 SDValue Shift = N->getOperand(0);
18647 SDValue Ones = N->getOperand(1);
18648 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18650 return SDValue();
18651
18652 // The shift should be smearing the sign bit across each vector element.
18653 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18654 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18655 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18656 return SDValue();
18657
18658 SDLoc DL(N);
18659 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18660 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18661}
18662
18663// Given a vecreduce_add node, detect the below pattern and convert it to the
18664 // node sequence with UABDL, [S|U]ABD and UADDLP.
18665//
18666// i32 vecreduce_add(
18667// v16i32 abs(
18668// v16i32 sub(
18669// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18670//
18671// or
18672//
18673// i32 vecreduce_add(
18674// v16i32 zext(
18675// v16i16 abs(
18676// v16i16 sub(
18677// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18678//
18679// =================>
18680// i32 vecreduce_add(
18681// v4i32 UADDLP(
18682// v8i16 add(
18683// v8i16 zext(
18684// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18685// v8i16 zext(
18686// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18688 SelectionDAG &DAG) {
18689 // Assumed i32 vecreduce_add
18690 if (N->getValueType(0) != MVT::i32)
18691 return SDValue();
18692
18693 SDValue VecReduceOp0 = N->getOperand(0);
18694 bool SawTrailingZext = false;
18695 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18696 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18697 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18698 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18699 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18700 SawTrailingZext = true;
18701 VecReduceOp0 = VecReduceOp0.getOperand(0);
18702 }
18703
18704 // The ABS input is v16i16 if we looked through that zext, otherwise v16i32.
18705 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18706 // Assumed v16i16 or v16i32 abs input
18707 unsigned Opcode = VecReduceOp0.getOpcode();
18708 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18709 return SDValue();
18710
18711 SDValue ABS = VecReduceOp0;
18712 // Assumed v16i16 or v16i32 sub
18713 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18714 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18715 return SDValue();
18716
18717 SDValue SUB = ABS->getOperand(0);
18718 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18719 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18720 // Assumed v16i16 or v16i32 type
18721 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18722 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18723 return SDValue();
18724
18725 // Assumed zext or sext
18726 bool IsZExt = false;
18727 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18728 IsZExt = true;
18729 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18730 IsZExt = false;
18731 } else
18732 return SDValue();
18733
18734 SDValue EXT0 = SUB->getOperand(0);
18735 SDValue EXT1 = SUB->getOperand(1);
18736 // Assumed zext's operand has v16i8 type
18737 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18738 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18739 return SDValue();
18740
18741 // Pattern is detected. Let's convert it to sequence of nodes.
18742 SDLoc DL(N);
18743
18744 // First, create the node pattern of UABD/SABD.
18745 SDValue UABDHigh8Op0 =
18746 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18747 DAG.getConstant(8, DL, MVT::i64));
18748 SDValue UABDHigh8Op1 =
18749 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18750 DAG.getConstant(8, DL, MVT::i64));
18751 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18752 UABDHigh8Op0, UABDHigh8Op1);
18753 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18754
18755 // Second, create the node pattern of UABAL.
18756 SDValue UABDLo8Op0 =
18757 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18758 DAG.getConstant(0, DL, MVT::i64));
18759 SDValue UABDLo8Op1 =
18760 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18761 DAG.getConstant(0, DL, MVT::i64));
18762 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18763 UABDLo8Op0, UABDLo8Op1);
18764 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18765 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18766
18767 // Third, create the node of UADDLP.
18768 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18769
18770 // Fourth, create the node of VECREDUCE_ADD.
18771 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18772}
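// Worked example of the rewrite above (illustrative values only): with
// a = splat(v16i8 10) and b = splat(v16i8 3), each 8-lane ABD produces lanes
// of 7, the widened v8i16 add gives lanes of 14, UADDLP pairs them into v4i32
// lanes of 28, and the final vecreduce_add yields 112 - the same result as
// summing |a[i] - b[i]| over all sixteen bytes directly.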
18773
18774static SDValue
18776 const AArch64Subtarget *ST) {
18777 if (DCI.isBeforeLegalize())
18778 return SDValue();
18779
18780 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18781 /*IsEqual=*/false))
18782 return While;
18783
18784 if (!N->getValueType(0).isScalableVector() ||
18785 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18786 return SDValue();
18787
18788 unsigned NumUses = N->use_size();
18789 auto MaskEC = N->getValueType(0).getVectorElementCount();
18790 if (!MaskEC.isKnownMultipleOf(NumUses))
18791 return SDValue();
18792
18793 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses);
18794 if (ExtMinEC.getKnownMinValue() < 2)
18795 return SDValue();
18796
18797 SmallVector<SDNode *> Extracts(NumUses, nullptr);
18798 for (SDNode *Use : N->users()) {
18799 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18800 return SDValue();
18801
18802    // Ensure the extract type is correct (e.g. if NumUses is 4 and
18803    // the mask return type is nxv8i1, each extract should be nxv2i1).
18804 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18805 return SDValue();
18806
18807 // There should be exactly one extract for each part of the mask.
18808 unsigned Offset = Use->getConstantOperandVal(1);
18809 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18810 if (Extracts[Part] != nullptr)
18811 return SDValue();
18812
18813 Extracts[Part] = Use;
18814 }
18815
18816 SelectionDAG &DAG = DCI.DAG;
18817 SDLoc DL(N);
18818 SDValue ID =
18819 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18820
18821 SDValue Idx = N->getOperand(0);
18822 SDValue TC = N->getOperand(1);
18823 EVT OpVT = Idx.getValueType();
18824 if (OpVT != MVT::i64) {
18825 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18826 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18827 }
18828
18829 // Create the whilelo_x2 intrinsics from each pair of extracts
18830 EVT ExtVT = Extracts[0]->getValueType(0);
18831 auto R =
18832 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18833 DCI.CombineTo(Extracts[0], R.getValue(0));
18834 DCI.CombineTo(Extracts[1], R.getValue(1));
18835
18836 if (NumUses == 2)
18837 return SDValue(N, 0);
18838
18839 auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2);
18840 for (unsigned I = 2; I < NumUses; I += 2) {
18841 // After the first whilelo_x2, we need to increment the starting value.
18842 Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts);
18843 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18844 DCI.CombineTo(Extracts[I], R.getValue(0));
18845 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18846 }
18847
18848 return SDValue(N, 0);
18849}
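// Shape of the transform above (hypothetical element counts): a mask-producing
// node of type nxv8i1 whose only users are four nxv2i1 EXTRACT_SUBVECTORs is
// replaced by two aarch64_sve_whilelo_x2 calls, each yielding a pair of nxv2i1
// results. Each later call starts at the original index plus the number of
// elements already covered; the increment uses a saturating add (UADDSAT) so
// an overflowing start value clamps rather than wraps.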
18850
18851// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
18852// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18853// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18854// If we have vectors larger than v16i8, we extract v16i8 subvectors and
18855// follow the same steps as above to get a DOT instruction for each, then
18856// concatenate them and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18858 const AArch64Subtarget *ST) {
18859 if (!ST->isNeonAvailable())
18860 return SDValue();
18861
18862 if (!ST->hasDotProd())
18864
18865 SDValue Op0 = N->getOperand(0);
18866 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18867 Op0.getValueType().getVectorElementType() != MVT::i32)
18868 return SDValue();
18869
18870 unsigned ExtOpcode = Op0.getOpcode();
18871 SDValue A = Op0;
18872 SDValue B;
18873 unsigned DotOpcode;
18874 if (ExtOpcode == ISD::MUL) {
18875 A = Op0.getOperand(0);
18876 B = Op0.getOperand(1);
18877 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18878 return SDValue();
18879 auto OpCodeA = A.getOpcode();
18880 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18881 return SDValue();
18882
18883 auto OpCodeB = B.getOpcode();
18884 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18885 return SDValue();
18886
18887 if (OpCodeA == OpCodeB) {
18888 DotOpcode =
18889 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18890 } else {
18891      // Check for USDOT support.
18892 if (!ST->hasMatMulInt8())
18893 return SDValue();
18894 DotOpcode = AArch64ISD::USDOT;
18895 if (OpCodeA == ISD::SIGN_EXTEND)
18896 std::swap(A, B);
18897 }
18898 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18899 DotOpcode = AArch64ISD::UDOT;
18900 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18901 DotOpcode = AArch64ISD::SDOT;
18902 } else {
18903 return SDValue();
18904 }
18905
18906 EVT Op0VT = A.getOperand(0).getValueType();
18907 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18908 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18909 if (!IsValidElementCount || !IsValidSize)
18910 return SDValue();
18911
18912 SDLoc DL(Op0);
18913  // For non-MLA reductions B can be set to 1. For MLA reductions we take the
18914  // operand of the extend B, i.e. the pre-extension value.
18915 if (!B)
18916 B = DAG.getConstant(1, DL, Op0VT);
18917 else
18918 B = B.getOperand(0);
18919
18920 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18921 unsigned NumOfVecReduce;
18922 EVT TargetType;
18923 if (IsMultipleOf16) {
18924 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18925 TargetType = MVT::v4i32;
18926 } else {
18927 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18928 TargetType = MVT::v2i32;
18929 }
18930 // Handle the case where we need to generate only one Dot operation.
18931 if (NumOfVecReduce == 1) {
18932 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18933 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18934 A.getOperand(0), B);
18935 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18936 }
18937 // Generate Dot instructions that are multiple of 16.
18938 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18939 SmallVector<SDValue, 4> SDotVec16;
18940 unsigned I = 0;
18941 for (; I < VecReduce16Num; I += 1) {
18942 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18943 SDValue Op0 =
18944 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18945 DAG.getConstant(I * 16, DL, MVT::i64));
18946 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18947 DAG.getConstant(I * 16, DL, MVT::i64));
18948 SDValue Dot =
18949 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18950 SDotVec16.push_back(Dot);
18951 }
18952 // Concatenate dot operations.
18953 EVT SDot16EVT =
18954 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18955 SDValue ConcatSDot16 =
18956 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18957 SDValue VecReduceAdd16 =
18958 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18959 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18960 if (VecReduce8Num == 0)
18961 return VecReduceAdd16;
18962
18963 // Generate the remainder Dot operation that is multiple of 8.
18964 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18965 SDValue Vec8Op0 =
18966 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18967 DAG.getConstant(I * 16, DL, MVT::i64));
18968 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18969 DAG.getConstant(I * 16, DL, MVT::i64));
18970 SDValue Dot =
18971 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18972 SDValue VecReduceAdd8 =
18973 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18974 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18975 VecReduceAdd8);
18976}
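// Worked example (illustrative): vecreduce_add(zext v16i8 a to v16i32) with
// a = splat(2) becomes vecreduce_add(UDOT(zeroes, a, splat(1))); each i32 lane
// of the UDOT accumulates four 2*1 products, giving v4i32 lanes of 8 and a
// final sum of 32 - identical to adding the sixteen zero-extended bytes.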
18977
18978// Given an (integer) vecreduce, we know the order of the inputs does not
18979// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18980// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18981// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18983 auto DetectAddExtract = [&](SDValue A) {
18984 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18985 // UADDLP(x) if found.
18986 assert(A.getOpcode() == ISD::ADD);
18987 EVT VT = A.getValueType();
18988 SDValue Op0 = A.getOperand(0);
18989 SDValue Op1 = A.getOperand(1);
18990 if (Op0.getOpcode() != Op1.getOpcode() ||
18991 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18992 Op0.getOpcode() != ISD::SIGN_EXTEND))
18993 return SDValue();
18994 SDValue Ext0 = Op0.getOperand(0);
18995 SDValue Ext1 = Op1.getOperand(0);
18996 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18998 Ext0.getOperand(0) != Ext1.getOperand(0))
18999 return SDValue();
19000    // Check that the extracted type has twice as many elements as the add type,
19001    // and that the extracts come from the upper/lower parts of the same source.
19003 VT.getVectorNumElements() * 2)
19004 return SDValue();
19005 if ((Ext0.getConstantOperandVal(1) != 0 ||
19007 (Ext1.getConstantOperandVal(1) != 0 ||
19009 return SDValue();
19010 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19011 : AArch64ISD::SADDLP;
19012 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19013 };
19014
19015 if (SDValue R = DetectAddExtract(A))
19016 return R;
19017
19018 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19019 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19020 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19021 A.getOperand(1));
19022 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19023 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19024 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19025 A.getOperand(0));
19026 return SDValue();
19027}
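// For illustration, with x = splat(v16i8 1) the two forms agree:
//   UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) and
//   UADDV(UADDLP(x))
// both reduce to 16, but the UADDLP form needs no explicit extracts or
// extends.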
19028
19029// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19030// UADDLV(concat), where the concat represents the 64-bit zext sources.
19032 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19033 // UADDLV(concat(zext, zext)) if found.
19034 assert(A.getOpcode() == ISD::ADD);
19035 EVT VT = A.getValueType();
19036 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19037 return SDValue();
19038 SDValue Op0 = A.getOperand(0);
19039 SDValue Op1 = A.getOperand(1);
19040 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19041 return SDValue();
19042 SDValue Ext0 = Op0.getOperand(0);
19043 SDValue Ext1 = Op1.getOperand(0);
19044 EVT ExtVT0 = Ext0.getValueType();
19045 EVT ExtVT1 = Ext1.getValueType();
19046 // Check zext VTs are the same and 64-bit length.
19047 if (ExtVT0 != ExtVT1 ||
19048 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19049 return SDValue();
19050 // Get VT for concat of zext sources.
19051 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19052 SDValue Concat =
19053 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19054
19055 switch (VT.getSimpleVT().SimpleTy) {
19056 case MVT::v2i64:
19057 case MVT::v4i32:
19058 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19059 case MVT::v8i16: {
19060 SDValue Uaddlv =
19061 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19062 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19063 }
19064 default:
19065 llvm_unreachable("Unhandled vector type");
19066 }
19067}
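// For illustration (hypothetical 64-bit sources a, b : v8i8): under a UADDV,
//   add(zext a to v8i16, zext b to v8i16)
// becomes a single UADDLV over concat(a, b) : v16i8, i.e. one horizontal
// widening add across all sixteen bytes instead of two extends plus an add.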
19068
19070 SDValue A = N->getOperand(0);
19071 if (A.getOpcode() == ISD::ADD) {
19072 if (SDValue R = performUADDVAddCombine(A, DAG))
19073 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19074 else if (SDValue R = performUADDVZextCombine(A, DAG))
19075 return R;
19076 }
19077
19078  // uaddv(A) --> A if all lanes of A other than lane 0 are known to be zero.
19079 MVT OpVT = A.getSimpleValueType();
19080 assert(N->getSimpleValueType(0) == OpVT &&
19081 "The operand type should be consistent with the result type of UADDV");
19083 Mask.clearBit(0);
19084 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19085 if (KnownLeadingLanes.isZero())
19086 return A;
19087
19088 return SDValue();
19089}
19090
19093 const AArch64Subtarget *Subtarget) {
19094 if (DCI.isBeforeLegalizeOps())
19095 return SDValue();
19096
19097 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19098}
19099
19100SDValue
19101AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19102 SelectionDAG &DAG,
19103 SmallVectorImpl<SDNode *> &Created) const {
19104 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19105 if (isIntDivCheap(N->getValueType(0), Attr))
19106 return SDValue(N, 0); // Lower SDIV as SDIV
19107
19108 EVT VT = N->getValueType(0);
19109
19110 // If SVE is available, we can generate
19111 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19112 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19113 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19114 return SDValue(N, 0);
19115
19116 // fold (sdiv X, pow2)
19117 if ((VT != MVT::i32 && VT != MVT::i64) ||
19118 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19119 return SDValue();
19120
19121  // If the divisor is 2 or -2, the default expansion is better. It adds the
19122  // dividend's sign bit (dividend >> (BitWidth - 1)) to it before shifting right.
19123 if (Divisor == 2 ||
19124 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19125 return SDValue();
19126
19127 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19128}
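// As a concrete instance of the SVE comment above (illustrative): a vector
// sdiv by splat(8) is returned unchanged here and later lowered along those
// lines, i.e. a predicated ASRD by #3, with an extra SUBR to negate the result
// when the divisor is -8, so no real division is emitted.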
19129
19130SDValue
19131AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19132 SelectionDAG &DAG,
19133 SmallVectorImpl<SDNode *> &Created) const {
19134 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19135 if (isIntDivCheap(N->getValueType(0), Attr))
19136 return SDValue(N, 0); // Lower SREM as SREM
19137
19138 EVT VT = N->getValueType(0);
19139
19140  // For scalable and fixed types, mark them as cheap so we can handle them much
19141  // later. This allows us to handle larger-than-legal types.
19142 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19143 return SDValue(N, 0);
19144
19145 // fold (srem X, pow2)
19146 if ((VT != MVT::i32 && VT != MVT::i64) ||
19147 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19148 return SDValue();
19149
19150 unsigned Lg2 = Divisor.countr_zero();
19151 if (Lg2 == 0)
19152 return SDValue();
19153
19154 SDLoc DL(N);
19155 SDValue N0 = N->getOperand(0);
19156 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19157 SDValue Zero = DAG.getConstant(0, DL, VT);
19158 SDValue CCVal, CSNeg;
19159 if (Lg2 == 1) {
19160 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19161 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19162 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19163
19164 Created.push_back(Cmp.getNode());
19165 Created.push_back(And.getNode());
19166 } else {
19167 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19168 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19169
19170 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19171 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19172 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19173 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19174 Negs.getValue(1));
19175
19176 Created.push_back(Negs.getNode());
19177 Created.push_back(AndPos.getNode());
19178 Created.push_back(AndNeg.getNode());
19179 }
19180
19181 return CSNeg;
19182}
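// Worked example of the CSNEG expansion above (illustrative): for srem(x, 8),
// Negs = 0 - x sets the flags, AndPos = x & 7, AndNeg = (0 - x) & 7, and
// CSNEG picks AndPos when 0 - x is negative (i.e. x > 0) and -AndNeg
// otherwise. With x = 10 this gives 10 & 7 = 2; with x = -10 it gives
// -(10 & 7) = -2, matching C's signed remainder.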
19183
19184static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19185 switch(getIntrinsicID(S.getNode())) {
19186 default:
19187 break;
19188 case Intrinsic::aarch64_sve_cntb:
19189 return 8;
19190 case Intrinsic::aarch64_sve_cnth:
19191 return 16;
19192 case Intrinsic::aarch64_sve_cntw:
19193 return 32;
19194 case Intrinsic::aarch64_sve_cntd:
19195 return 64;
19196 }
19197 return {};
19198}
19199
19200/// Calculates what the pre-extend type is, based on the extension
19201/// operation node provided by \p Extend.
19202///
19203/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19204/// pre-extend type is pulled directly from the operand, while other extend
19205/// operations need a bit more inspection to get this information.
19206///
19207/// \param Extend The SDNode from the DAG that represents the extend operation
19208///
19209/// \returns The type representing the \p Extend source type, or \p MVT::Other
19210/// if no valid type can be determined
19212 switch (Extend.getOpcode()) {
19213 case ISD::SIGN_EXTEND:
19214 case ISD::ZERO_EXTEND:
19215 case ISD::ANY_EXTEND:
19216 return Extend.getOperand(0).getValueType();
19217 case ISD::AssertSext:
19218 case ISD::AssertZext:
19220 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19221 if (!TypeNode)
19222 return MVT::Other;
19223 return TypeNode->getVT();
19224 }
19225 case ISD::AND: {
19228 if (!Constant)
19229 return MVT::Other;
19230
19231 uint32_t Mask = Constant->getZExtValue();
19232
19233 if (Mask == UCHAR_MAX)
19234 return MVT::i8;
19235 else if (Mask == USHRT_MAX)
19236 return MVT::i16;
19237 else if (Mask == UINT_MAX)
19238 return MVT::i32;
19239
19240 return MVT::Other;
19241 }
19242 default:
19243 return MVT::Other;
19244 }
19245}
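// For illustration: an (and x, 0xffff) operand reports a pre-extend type of
// i16 and (AssertZext x, i8) reports i8, while a mask such as 0x7fff, which
// does not correspond to a whole narrower type, is rejected with MVT::Other.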
19246
19247/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19248/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19249/// SExt/ZExt rather than the scalar SExt/ZExt
19251 EVT VT = BV.getValueType();
19252 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19254 return SDValue();
19255
19256 // Use the first item in the buildvector/shuffle to get the size of the
19257 // extend, and make sure it looks valid.
19258 SDValue Extend = BV->getOperand(0);
19259 unsigned ExtendOpcode = Extend.getOpcode();
19260 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19261 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19262 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19263 ExtendOpcode == ISD::AssertSext;
19264 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19265 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19266 return SDValue();
19267 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19268 // ensure calculatePreExtendType will work without issue.
19269 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19270 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19271 return SDValue();
19272
19273 // Restrict valid pre-extend data type
19274 EVT PreExtendType = calculatePreExtendType(Extend);
19275 if (PreExtendType == MVT::Other ||
19276 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19277 return SDValue();
19278
19279 // Make sure all other operands are equally extended.
19280 bool SeenZExtOrSExt = !IsAnyExt;
19281 for (SDValue Op : drop_begin(BV->ops())) {
19282 if (Op.isUndef())
19283 continue;
19284
19285 if (calculatePreExtendType(Op) != PreExtendType)
19286 return SDValue();
19287
19288 unsigned Opc = Op.getOpcode();
19289 if (Opc == ISD::ANY_EXTEND)
19290 continue;
19291
19292 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19294
19295 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19296 return SDValue();
19297
19298 IsSExt = OpcIsSExt;
19299 SeenZExtOrSExt = true;
19300 }
19301
19302 SDValue NBV;
19303 SDLoc DL(BV);
19304 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19305 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19306 EVT PreExtendLegalType =
19307 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19309 for (SDValue Op : BV->ops())
19310 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19311 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19312 PreExtendLegalType));
19313 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19314 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19315 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19316 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19317 BV.getOperand(1).isUndef()
19318 ? DAG.getUNDEF(PreExtendVT)
19319 : BV.getOperand(1).getOperand(0),
19320 cast<ShuffleVectorSDNode>(BV)->getMask());
19321 }
19322 unsigned ExtOpc = !SeenZExtOrSExt
19324 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19325 return DAG.getNode(ExtOpc, DL, VT, NBV);
19326}
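// For illustration (hypothetical v8i16 case): a BUILD_VECTOR whose operands
// are all zero-extended i8 scalars is rebuilt as a v8i8 BUILD_VECTOR of the
// original narrow values followed by one vector ZERO_EXTEND to v8i16, so a
// surrounding mul can later select the widening umull form instead of doing
// eight scalar extends.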
19327
19328/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19329/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19331 // If the value type isn't a vector, none of the operands are going to be dups
19332 EVT VT = Mul->getValueType(0);
19333 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19334 return SDValue();
19335
19336 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19337 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19338
19339  // Neither operand has been changed; don't make any further changes.
19340 if (!Op0 && !Op1)
19341 return SDValue();
19342
19343 SDLoc DL(Mul);
19344 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19345 Op1 ? Op1 : Mul->getOperand(1));
19346}
19347
19348// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19349// Same for other types with equivalent constants.
19351 EVT VT = N->getValueType(0);
19352 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19353 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19354 return SDValue();
19355 if (N->getOperand(0).getOpcode() != ISD::AND ||
19356 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19357 return SDValue();
19358
19359 SDValue And = N->getOperand(0);
19360 SDValue Srl = And.getOperand(0);
19361
19362 APInt V1, V2, V3;
19363 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19364 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19366 return SDValue();
19367
19368 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19369 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19370 V3 != (HalfSize - 1))
19371 return SDValue();
19372
19373 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19374 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19375 VT.getVectorElementCount() * 2);
19376
19377 SDLoc DL(N);
19378 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19379 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19380 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19381 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19382}
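// Why the rewrite above is sound (informal sketch): viewing each v4i32 lane as
// two i16 halves, (srl X, 15) & 0x10001 leaves bit 0 of each half equal to
// that half's sign bit, and multiplying by 0xffff smears that bit across the
// half. The lane is therefore all-ones exactly where the i16 half is negative,
// which is the setcc(0 > x) / CMLT #0 computed on the v8i16 view of X.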
19383
19384// Transform vector add(zext i8 to i32, zext i8 to i32)
19385// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19386// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19387// extends.
19389 EVT VT = N->getValueType(0);
19390 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19391 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19392 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19393 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19394 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19395 N->getOperand(0).getOperand(0).getValueType() !=
19396 N->getOperand(1).getOperand(0).getValueType())
19397 return SDValue();
19398
19399 if (N->getOpcode() == ISD::MUL &&
19400 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19401 return SDValue();
19402
19403 SDValue N0 = N->getOperand(0).getOperand(0);
19404 SDValue N1 = N->getOperand(1).getOperand(0);
19405 EVT InVT = N0.getValueType();
19406
19407 EVT S1 = InVT.getScalarType();
19408 EVT S2 = VT.getScalarType();
19409 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19410 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19411 SDLoc DL(N);
19412 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19415 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19416 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19417 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19418 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19419 : (unsigned)ISD::SIGN_EXTEND,
19420 DL, VT, NewOp);
19421 }
19422 return SDValue();
19423}
19424
19427 const AArch64Subtarget *Subtarget) {
19428
19429 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19430 return Ext;
19432 return Ext;
19433 if (SDValue Ext = performVectorExtCombine(N, DAG))
19434 return Ext;
19435
19436 if (DCI.isBeforeLegalizeOps())
19437 return SDValue();
19438
19439 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19440 // and in MachineCombiner pass, add+mul will be combined into madd.
19441 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19442 SDLoc DL(N);
19443 EVT VT = N->getValueType(0);
19444 SDValue N0 = N->getOperand(0);
19445 SDValue N1 = N->getOperand(1);
19446 SDValue MulOper;
19447 unsigned AddSubOpc;
19448
19449 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19450 AddSubOpc = V->getOpcode();
19451 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19452 SDValue Opnd = V->getOperand(1);
19453 MulOper = V->getOperand(0);
19454 if (AddSubOpc == ISD::SUB)
19455 std::swap(Opnd, MulOper);
19456 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19457 return C->isOne();
19458 }
19459 return false;
19460 };
19461
19462 if (IsAddSubWith1(N0)) {
19463 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19464 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19465 }
19466
19467 if (IsAddSubWith1(N1)) {
19468 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19469 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19470 }
19471
19472 // The below optimizations require a constant RHS.
19473 if (!isa<ConstantSDNode>(N1))
19474 return SDValue();
19475
19477 const APInt &ConstValue = C->getAPIntValue();
19478
19479  // Allow the scaling to be folded into the `cnt` instruction by preventing
19480  // the scaling from being obscured here. This makes it easier to pattern match.
19481 if (IsSVECntIntrinsic(N0) ||
19482 (N0->getOpcode() == ISD::TRUNCATE &&
19483 (IsSVECntIntrinsic(N0->getOperand(0)))))
19484 if (ConstValue.sge(1) && ConstValue.sle(16))
19485 return SDValue();
19486
19487 // Multiplication of a power of two plus/minus one can be done more
19488 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19489 // future CPUs have a cheaper MADD instruction, this may need to be
19490 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19491 // 64-bit is 5 cycles, so this is always a win.
19492 // More aggressively, some multiplications N0 * C can be lowered to
19493 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19494 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19495 // TODO: lower more cases.
19496
19497 // TrailingZeroes is used to test if the mul can be lowered to
19498 // shift+add+shift.
19499 unsigned TrailingZeroes = ConstValue.countr_zero();
19500 if (TrailingZeroes) {
19501 // Conservatively do not lower to shift+add+shift if the mul might be
19502 // folded into smul or umul.
19503 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19504 isZeroExtended(N0, DAG)))
19505 return SDValue();
19506 // Conservatively do not lower to shift+add+shift if the mul might be
19507 // folded into madd or msub.
19508 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19509 N->user_begin()->getOpcode() == ISD::SUB))
19510 return SDValue();
19511 }
19512 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19513 // and shift+add+shift.
19514 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19515 unsigned ShiftAmt;
19516
19517 auto Shl = [&](SDValue N0, unsigned N1) {
19518 if (!N0.getNode())
19519 return SDValue();
19520 // If shift causes overflow, ignore this combine.
19521 if (N1 >= N0.getValueSizeInBits())
19522 return SDValue();
19523 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19524 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19525 };
19526 auto Add = [&](SDValue N0, SDValue N1) {
19527 if (!N0.getNode() || !N1.getNode())
19528 return SDValue();
19529 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19530 };
19531 auto Sub = [&](SDValue N0, SDValue N1) {
19532 if (!N0.getNode() || !N1.getNode())
19533 return SDValue();
19534 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19535 };
19536 auto Negate = [&](SDValue N) {
19537 if (!N0.getNode())
19538 return SDValue();
19539 SDValue Zero = DAG.getConstant(0, DL, VT);
19540 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19541 };
19542
19543 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19544  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1), as
19545  // the (2^N - 1) form can't be executed with a single instruction.
19546 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19547 unsigned BitWidth = C.getBitWidth();
19548 for (unsigned i = 1; i < BitWidth / 2; i++) {
19549 APInt Rem;
19550 APInt X(BitWidth, (1 << i) + 1);
19551 APInt::sdivrem(C, X, N, Rem);
19552 APInt NVMinus1 = N - 1;
19553 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19554 M = X;
19555 return true;
19556 }
19557 }
19558 return false;
19559 };
19560
19561  // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19562  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1, as
19563  // the (2^N - 1) form can't be executed with a single instruction.
19564 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19565 APInt CVMinus1 = C - 1;
19566 if (CVMinus1.isNegative())
19567 return false;
19568 unsigned TrailingZeroes = CVMinus1.countr_zero();
19569 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19570 if (SCVMinus1.isPowerOf2()) {
19571 unsigned BitWidth = SCVMinus1.getBitWidth();
19572 M = APInt(BitWidth, SCVMinus1.logBase2());
19573 N = APInt(BitWidth, TrailingZeroes);
19574 return true;
19575 }
19576 return false;
19577 };
19578
19579 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19580 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19581 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19582 APInt CVMinus1 = C - 1;
19583 if (CVMinus1.isNegative())
19584 return false;
19585 unsigned TrailingZeroes = CVMinus1.countr_zero();
19586 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19587 if (CVPlus1.isPowerOf2()) {
19588 unsigned BitWidth = CVPlus1.getBitWidth();
19589 M = APInt(BitWidth, CVPlus1.logBase2());
19590 N = APInt(BitWidth, TrailingZeroes);
19591 return true;
19592 }
19593 return false;
19594 };
19595
19596 if (ConstValue.isNonNegative()) {
19597 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19598 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19599 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19600 // (mul x, (2^M + 1) * (2^N + 1))
19601 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19602    // (mul x, (2^M + 1) * 2^N + 1)
19603    //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
19604    // (mul x, 1 - (1 - 2^M) * 2^N)
19605    //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
19606 APInt SCVMinus1 = ShiftedConstValue - 1;
19607 APInt SCVPlus1 = ShiftedConstValue + 1;
19608 APInt CVPlus1 = ConstValue + 1;
19609 APInt CVM, CVN;
19610 if (SCVMinus1.isPowerOf2()) {
19611 ShiftAmt = SCVMinus1.logBase2();
19612 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19613 } else if (CVPlus1.isPowerOf2()) {
19614 ShiftAmt = CVPlus1.logBase2();
19615 return Sub(Shl(N0, ShiftAmt), N0);
19616 } else if (SCVPlus1.isPowerOf2()) {
19617 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19618 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19619 }
19620 if (Subtarget->hasALULSLFast() &&
19621 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19622 APInt CVMMinus1 = CVM - 1;
19623 APInt CVNMinus1 = CVN - 1;
19624 unsigned ShiftM1 = CVMMinus1.logBase2();
19625 unsigned ShiftN1 = CVNMinus1.logBase2();
19626      // ALULSLFast implies that shifts by up to 4 places are fast
19627 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19628 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19629 return Add(Shl(MVal, ShiftN1), MVal);
19630 }
19631 }
19632 if (Subtarget->hasALULSLFast() &&
19633 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19634 unsigned ShiftM = CVM.getZExtValue();
19635 unsigned ShiftN = CVN.getZExtValue();
19636      // ALULSLFast implies that shifts by up to 4 places are fast
19637 if (ShiftM <= 4 && ShiftN <= 4) {
19638 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19639 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19640 }
19641 }
19642
19643 if (Subtarget->hasALULSLFast() &&
19644 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19645 unsigned ShiftM = CVM.getZExtValue();
19646 unsigned ShiftN = CVN.getZExtValue();
19647      // ALULSLFast implies that shifts by up to 4 places are fast
19648 if (ShiftM <= 4 && ShiftN <= 4) {
19649 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19650 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19651 }
19652 }
19653 } else {
19654 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19655 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19656 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19657 APInt SCVPlus1 = -ShiftedConstValue + 1;
19658 APInt CVNegPlus1 = -ConstValue + 1;
19659 APInt CVNegMinus1 = -ConstValue - 1;
19660 if (CVNegPlus1.isPowerOf2()) {
19661 ShiftAmt = CVNegPlus1.logBase2();
19662 return Sub(N0, Shl(N0, ShiftAmt));
19663 } else if (CVNegMinus1.isPowerOf2()) {
19664 ShiftAmt = CVNegMinus1.logBase2();
19665 return Negate(Add(Shl(N0, ShiftAmt), N0));
19666 } else if (SCVPlus1.isPowerOf2()) {
19667 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19668 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19669 }
19670 }
19671
19672 return SDValue();
19673}
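// Worked example for the decompositions above (illustrative, assumes
// ALULSLFast): multiplying by 45 = (1 + 4) * (1 + 8) takes the
// isPowPlusPlusConst path:
//   MVal = (x << 2) + x        // 5 * x
//   Res  = (MVal << 3) + MVal  // 40 * x + 5 * x = 45 * x
// i.e. two add-with-shifted-register instructions instead of a mov + mul.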
19674
19676 SelectionDAG &DAG) {
19677 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19678 // optimize away operation when it's from a constant.
19679 //
19680 // The general transformation is:
19681 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19682 // AND(VECTOR_CMP(x,y), constant2)
19683 // constant2 = UNARYOP(constant)
19684
19685 // Early exit if this isn't a vector operation, the operand of the
19686 // unary operation isn't a bitwise AND, or if the sizes of the operations
19687 // aren't the same.
19688 EVT VT = N->getValueType(0);
19689 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19690 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19691 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19692 return SDValue();
19693
19694 // Now check that the other operand of the AND is a constant. We could
19695 // make the transformation for non-constant splats as well, but it's unclear
19696 // that would be a benefit as it would not eliminate any operations, just
19697 // perform one more step in scalar code before moving to the vector unit.
19698 if (BuildVectorSDNode *BV =
19699 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19700 // Bail out if the vector isn't a constant.
19701 if (!BV->isConstant())
19702 return SDValue();
19703
19704 // Everything checks out. Build up the new and improved node.
19705 SDLoc DL(N);
19706 EVT IntVT = BV->getValueType(0);
19707 // Create a new constant of the appropriate type for the transformed
19708 // DAG.
19709 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19710 // The AND node needs bitcasts to/from an integer vector type around it.
19711 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19712 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19713 N->getOperand(0)->getOperand(0), MaskConst);
19714 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19715 return Res;
19716 }
19717
19718 return SDValue();
19719}
19720
19721/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19722/// functions, this can help to reduce the number of fmovs to/from GPRs.
19723static SDValue
19726 const AArch64Subtarget *Subtarget) {
19727 if (N->isStrictFPOpcode())
19728 return SDValue();
19729
19730 if (DCI.isBeforeLegalizeOps())
19731 return SDValue();
19732
19733 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19734 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19735 return SDValue();
19736
19737 auto isSupportedType = [](EVT VT) {
19738 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19739 };
19740
19741 SDValue SrcVal = N->getOperand(0);
19742 EVT SrcTy = SrcVal.getValueType();
19743 EVT DestTy = N->getValueType(0);
19744
19745 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19746 return SDValue();
19747
19748 EVT SrcVecTy;
19749 EVT DestVecTy;
19750 if (DestTy.bitsGT(SrcTy)) {
19751 DestVecTy = getPackedSVEVectorVT(DestTy);
19752 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19753 } else {
19754 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19755 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19756 }
19757
19758 // Ensure the resulting src/dest vector type is legal.
19759 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19760 return SDValue();
19761
19762 SDLoc DL(N);
19763 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19764 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19765 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19766 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19767 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19768}
19769
19772 const AArch64Subtarget *Subtarget) {
19773 // First try to optimize away the conversion when it's conditionally from
19774 // a constant. Vectors only.
19776 return Res;
19777
19778 if (SDValue Res =
19779 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19780 return Res;
19781
19782 EVT VT = N->getValueType(0);
19783 if (VT != MVT::f32 && VT != MVT::f64)
19784 return SDValue();
19785
19786 // Only optimize when the source and destination types have the same width.
19787 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19788 return SDValue();
19789
19790 // If the result of an integer load is only used by an integer-to-float
19791  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
19792 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19793 SDValue N0 = N->getOperand(0);
19794 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19795 N0.hasOneUse() &&
19796 // Do not change the width of a volatile load.
19797 !cast<LoadSDNode>(N0)->isVolatile()) {
19798 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19799 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19800 LN0->getPointerInfo(), LN0->getAlign(),
19801 LN0->getMemOperand()->getFlags());
19802
19803 // Make sure successors of the original load stay after it by updating them
19804 // to use the new Chain.
19805 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19806
19807 unsigned Opcode =
19808 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19809 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19810 }
19811
19812 return SDValue();
19813}
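// For illustration: (f64)(sitofp (load i64 [p])) becomes an f64 load of the
// same eight bytes followed by AArch64ISD::SITOF, so the value lands directly
// in a SIMD&FP register and the GPR-to-FPR fmov is avoided.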
19814
19815/// Fold a floating-point multiply by power of two into floating-point to
19816/// fixed-point conversion.
19819 const AArch64Subtarget *Subtarget) {
19820 if (SDValue Res =
19821 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19822 return Res;
19823
19824 if (!Subtarget->isNeonAvailable())
19825 return SDValue();
19826
19827 if (!N->getValueType(0).isSimple())
19828 return SDValue();
19829
19830 SDValue Op = N->getOperand(0);
19831 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19832 return SDValue();
19833
19834 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19835 return SDValue();
19836
19837 SDValue ConstVec = Op->getOperand(1);
19838 if (!isa<BuildVectorSDNode>(ConstVec))
19839 return SDValue();
19840
19841 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19842 uint32_t FloatBits = FloatTy.getSizeInBits();
19843 if (FloatBits != 32 && FloatBits != 64 &&
19844 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19845 return SDValue();
19846
19847 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19848 uint32_t IntBits = IntTy.getSizeInBits();
19849 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19850 return SDValue();
19851
19852 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19853 if (IntBits > FloatBits)
19854 return SDValue();
19855
19856 BitVector UndefElements;
19858 int32_t Bits = IntBits == 64 ? 64 : 32;
19859 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19860 if (C == -1 || C == 0 || C > Bits)
19861 return SDValue();
19862
19863 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19864 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19865 return SDValue();
19866
19867 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19868 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19869 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19870 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19871 return SDValue();
19872 }
19873
19874 SDLoc DL(N);
19875 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19876 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19877 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19878 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19879 SDValue FixConv =
19881 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19882 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19883 // We can handle smaller integers by generating an extra trunc.
19884 if (IntBits < FloatBits)
19885 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19886
19887 return FixConv;
19888}
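// For illustration: fptosi(fmul v4f32 x, splat(8.0)) matches with C = 3
// (8.0 == 2^3) and becomes the aarch64_neon_vcvtfp2fxs intrinsic, which
// selects to a single fixed-point convert "fcvtzs v0.4s, v0.4s, #3".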
19889
19890// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19891// convert to csel(ccmp(.., cc0)), depending on cc1:
19892
19893// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19894// =>
19895// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19896//
19897// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19898// =>
19899// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19901 EVT VT = N->getValueType(0);
19902 SDValue CSel0 = N->getOperand(0);
19903 SDValue CSel1 = N->getOperand(1);
19904
19905 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19906 CSel1.getOpcode() != AArch64ISD::CSEL)
19907 return SDValue();
19908
19909 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19910 return SDValue();
19911
19912 if (!isNullConstant(CSel0.getOperand(0)) ||
19913 !isOneConstant(CSel0.getOperand(1)) ||
19914 !isNullConstant(CSel1.getOperand(0)) ||
19915 !isOneConstant(CSel1.getOperand(1)))
19916 return SDValue();
19917
19918 SDValue Cmp0 = CSel0.getOperand(3);
19919 SDValue Cmp1 = CSel1.getOperand(3);
19922 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19923 return SDValue();
19924 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19925 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19926 std::swap(Cmp0, Cmp1);
19927 std::swap(CC0, CC1);
19928 }
19929
19930 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19931 return SDValue();
19932
19933 SDLoc DL(N);
19934 SDValue CCmp, Condition;
19935 unsigned NZCV;
19936
19937 if (N->getOpcode() == ISD::AND) {
19939 Condition = getCondCode(DAG, InvCC0);
19941 } else {
19943 Condition = getCondCode(DAG, CC0);
19945 }
19946
19947 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19948
19949 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19950 if (Op1 && Op1->getAPIntValue().isNegative() &&
19951 Op1->getAPIntValue().sgt(-32)) {
19952    // CCMP only accepts constants in the range [0, 31]. If Op1 is a constant
19953    // in the range [-31, -1], we can select CCMN with the absolute value
19954    // instead and avoid the extra mov.
19955 SDValue AbsOp1 =
19956 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19957 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
19958 AbsOp1, NZCVOp, Condition, Cmp0);
19959 } else {
19960 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
19961 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19962 }
19963 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19964 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
19965}
19966
19968 const AArch64Subtarget *Subtarget,
19969 const AArch64TargetLowering &TLI) {
19970 SelectionDAG &DAG = DCI.DAG;
19971
19972 if (SDValue R = performANDORCSELCombine(N, DAG))
19973 return R;
19974
19975 return SDValue();
19976}
19977
19979 if (!MemVT.getVectorElementType().isSimple())
19980 return false;
19981
19982 uint64_t MaskForTy = 0ull;
19983 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19984 case MVT::i8:
19985 MaskForTy = 0xffull;
19986 break;
19987 case MVT::i16:
19988 MaskForTy = 0xffffull;
19989 break;
19990 case MVT::i32:
19991 MaskForTy = 0xffffffffull;
19992 break;
19993 default:
19994 return false;
19995 break;
19996 }
19997
19998 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19999 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20000 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20001
20002 return false;
20003}
20004
20006 SDValue LeafOp = SDValue(N, 0);
20007 SDValue Op = N->getOperand(0);
20008 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20009 LeafOp.getValueType() != Op.getValueType())
20010 Op = Op->getOperand(0);
20011 if (LeafOp.getValueType() == Op.getValueType())
20012 return Op;
20013 return SDValue();
20014}
20015
20018 SelectionDAG &DAG = DCI.DAG;
20019 SDValue Src = N->getOperand(0);
20020 unsigned Opc = Src->getOpcode();
20021
20022 // Zero/any extend of an unsigned unpack
20023 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20024 SDValue UnpkOp = Src->getOperand(0);
20025 SDValue Dup = N->getOperand(1);
20026
20027 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20028 return SDValue();
20029
20030 SDLoc DL(N);
20032 if (!C)
20033 return SDValue();
20034
20035 uint64_t ExtVal = C->getZExtValue();
20036
20037 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20038 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20039 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20040 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20041 };
20042
20043 // If the mask is fully covered by the unpack, we don't need to push
20044 // a new AND onto the operand
20045 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20046 if (MaskAndTypeMatch(EltTy))
20047 return Src;
20048
20049 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20050 // to see if the mask is all-ones of size MemTy.
20051 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20052 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20053 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20054 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20055 if (MaskAndTypeMatch(EltTy))
20056 return Src;
20057 }
20058
20059    // Truncate to prevent a DUP with an over-wide constant.
20060 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20061
20062 // Otherwise, make sure we propagate the AND to the operand
20063 // of the unpack
20064 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20065 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20066
20067 SDValue And = DAG.getNode(ISD::AND, DL,
20068 UnpkOp->getValueType(0), UnpkOp, Dup);
20069
20070 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20071 }
20072
20073 if (DCI.isBeforeLegalizeOps())
20074 return SDValue();
20075
20076 // If both sides of AND operations are i1 splat_vectors then
20077 // we can produce just i1 splat_vector as the result.
20078 if (isAllActivePredicate(DAG, N->getOperand(0)))
20079 return N->getOperand(1);
20080 if (isAllActivePredicate(DAG, N->getOperand(1)))
20081 return N->getOperand(0);
20082
20084 return SDValue();
20085
20086 SDValue Mask = N->getOperand(1);
20087
20088 if (!Src.hasOneUse())
20089 return SDValue();
20090
20091 EVT MemVT;
20092
20093 // SVE load instructions perform an implicit zero-extend, which makes them
20094 // perfect candidates for combining.
20095 switch (Opc) {
20096 case AArch64ISD::LD1_MERGE_ZERO:
20097 case AArch64ISD::LDNF1_MERGE_ZERO:
20098 case AArch64ISD::LDFF1_MERGE_ZERO:
20099 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20100 break;
20101 case AArch64ISD::GLD1_MERGE_ZERO:
20102 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20103 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20104 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20105 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20106 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20107 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20108 case AArch64ISD::GLDFF1_MERGE_ZERO:
20109 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20110 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20111 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20112 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20113 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20114 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20115 case AArch64ISD::GLDNT1_MERGE_ZERO:
20116 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20117 break;
20118 default:
20119 return SDValue();
20120 }
20121
20122 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20123 return Src;
20124
20125 return SDValue();
20126}
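// For illustration: with x : nxv16i8, and(uunpklo(x) : nxv8i16, splat(0xff))
// is redundant because UUNPKLO already zero-extends each byte into its i16
// lane, so the combine returns the unpack unchanged; other splat masks are
// truncated and pushed below the unpack so the AND happens at the narrower
// type.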
20127
20128// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20131
20132  // This function performs an optimization on a specific pattern involving
20133  // an AND operation and a SETCC (set condition code) node.
20134
20135 SDValue SetCC = N->getOperand(0);
20136 EVT VT = N->getValueType(0);
20137 SelectionDAG &DAG = DCI.DAG;
20138
20139  // If the current node (N) is used by any SELECT instruction, bail out and
20140  // return an empty SDValue: applying the optimization there would produce
20141  // incorrect results.
20142 for (auto U : N->users())
20143 if (U->getOpcode() == ISD::SELECT)
20144 return SDValue();
20145
20146 // Check if the operand is a SETCC node with floating-point comparison
20147 if (SetCC.getOpcode() == ISD::SETCC &&
20148 SetCC.getOperand(0).getValueType() == MVT::f32) {
20149
20150 SDValue Cmp;
20152
20153 // Check if the DAG is after legalization and if we can emit the conjunction
20154 if (!DCI.isBeforeLegalize() &&
20155 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20156
20158
20159 SDLoc DL(N);
20160 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20161 DAG.getConstant(0, DL, VT),
20162 getCondCode(DAG, InvertedCC), Cmp);
20163 }
20164 }
20165 return SDValue();
20166}
20167
20170 SelectionDAG &DAG = DCI.DAG;
20171 SDValue LHS = N->getOperand(0);
20172 SDValue RHS = N->getOperand(1);
20173 EVT VT = N->getValueType(0);
20174
20175 if (SDValue R = performANDORCSELCombine(N, DAG))
20176 return R;
20177
20178 if (SDValue R = performANDSETCCCombine(N,DCI))
20179 return R;
20180
20181 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20182 return SDValue();
20183
20184 if (VT.isScalableVector())
20185 return performSVEAndCombine(N, DCI);
20186
20187 // The combining code below works only for NEON vectors. In particular, it
20188 // does not work for SVE when dealing with vectors wider than 128 bits.
20189 if (!VT.is64BitVector() && !VT.is128BitVector())
20190 return SDValue();
20191
20193 if (!BVN)
20194 return SDValue();
20195
20196 // AND does not accept an immediate, so check if we can use a BIC immediate
20197 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20198 // pattern in isel, because some immediates may be lowered to the preferred
20199 // (and x, (movi imm)) form, even though an mvni representation also exists.
20200 APInt DefBits(VT.getSizeInBits(), 0);
20201 APInt UndefBits(VT.getSizeInBits(), 0);
20202 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20203 SDValue NewOp;
20204
20205 // Any bits known to already be 0 need not be cleared again, which can help
20206 // reduce the size of the immediate to one supported by the instruction.
20207 KnownBits Known = DAG.computeKnownBits(LHS);
20208 APInt ZeroSplat(VT.getSizeInBits(), 0);
20209 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20210 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20211 << (Known.Zero.getBitWidth() * I);
20212
20213 DefBits = ~(DefBits | ZeroSplat);
20214 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20215 DefBits, &LHS)) ||
20216 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20217 DefBits, &LHS)))
20218 return NewOp;
20219
20220 UndefBits = ~(UndefBits | ZeroSplat);
20221 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20222 UndefBits, &LHS)) ||
20223 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20224 UndefBits, &LHS)))
20225 return NewOp;
20226 }
20227
20228 return SDValue();
20229}
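// For illustration: a v4i32 AND with splat(0x00ffffff) has no AND-immediate
// encoding, but its complement 0xff000000 is a valid vector BIC immediate
// (0xff shifted left by 24), so the combine emits BICi to clear the top byte
// of every lane instead of materialising the mask in a register.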
20230
20233 SelectionDAG &DAG = DCI.DAG;
20234 SDValue LHS = N->getOperand(0);
20235 SDValue RHS = N->getOperand(1);
20236 EVT VT = N->getValueType(0);
20237 SDLoc DL(N);
20238
20239 if (!N->getFlags().hasAllowReassociation())
20240 return SDValue();
20241
20242  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20243 auto ReassocComplex = [&](SDValue A, SDValue B) {
20244 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20245 return SDValue();
20246 unsigned Opc = A.getConstantOperandVal(0);
20247 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20248 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20249 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20250 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20251 return SDValue();
20252 SDValue VCMLA = DAG.getNode(
20253 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20254 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20255 A.getOperand(2), A.getOperand(3));
20256 VCMLA->setFlags(A->getFlags());
20257 return VCMLA;
20258 };
20259 if (SDValue R = ReassocComplex(LHS, RHS))
20260 return R;
20261 if (SDValue R = ReassocComplex(RHS, LHS))
20262 return R;
20263
20264 return SDValue();
20265}
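// For illustration: with reassociation allowed, fadd(a, vcmla_rot90(acc, b, c))
// is rewritten as vcmla_rot90(fadd(a, acc), b, c), folding the outer add into
// the accumulator operand of the complex multiply-accumulate.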
20266
20267static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20268 switch (Opcode) {
20269 case ISD::STRICT_FADD:
20270 case ISD::FADD:
20271 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20272 case ISD::ADD:
20273 return VT == MVT::i64;
20274 default:
20275 return false;
20276 }
20277}
20278
20279static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20281
20283 if ((N.getOpcode() == ISD::SETCC) ||
20284 // get_active_lane_mask is lowered to a whilelo instruction.
20285 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20286 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20287 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20288 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20289 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20290 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20291 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20292 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20293 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20294 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20295 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20296 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20297 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20298 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20299 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20300 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20301 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20302 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20303 return true;
20304
20305 return false;
20306}
20307
20308// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20309// ... into: "ptrue p, all" + PTEST
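// Illustrative sketch (names and types are placeholders): for a predicate
// produced by a flag-setting operation such as whilelo,
//   %p = whilelo ...                  ; nxv4i1
//   %r = extract_vector_elt %p, 0
// the extract is rewritten as PTEST(%p, ptrue all) with a FIRST_ACTIVE
// condition, so lane 0 is materialised from the NZCV flags rather than read
// out of a vector lane.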
20310static SDValue
20313 const AArch64Subtarget *Subtarget) {
20314 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20315 // Make sure PTEST can be legalised with illegal types.
20316 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20317 return SDValue();
20318
20319 SDValue N0 = N->getOperand(0);
20320 EVT VT = N0.getValueType();
20321
20322 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20323 !isNullConstant(N->getOperand(1)))
20324 return SDValue();
20325
20326 // Restrict the DAG combine to cases where we're extracting from a
20327 // flag-setting operation.
20328 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20329 return SDValue();
20330
20331 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20332 SelectionDAG &DAG = DCI.DAG;
20333 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20334 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20335}
20336
20337// Materialize : Idx = (add (mul vscale, NumEls), -1)
20338// i1 = extract_vector_elt t37, Constant:i64<Idx>
20339// ... into: "ptrue p, all" + PTEST
20340static SDValue
20343 const AArch64Subtarget *Subtarget) {
20344 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20345 // Make sure PTEST can be legalised with illegal types.
20346 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20347 return SDValue();
20348
20349 SDValue N0 = N->getOperand(0);
20350 EVT OpVT = N0.getValueType();
20351
20352 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20353 return SDValue();
20354
20355 // Idx == (add (mul vscale, NumEls), -1)
20356 SDValue Idx = N->getOperand(1);
20357 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20358 return SDValue();
20359
20360 SDValue VS = Idx.getOperand(0);
20361 if (VS.getOpcode() != ISD::VSCALE)
20362 return SDValue();
20363
20364 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20365 if (VS.getConstantOperandVal(0) != NumEls)
20366 return SDValue();
20367
20368 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20369 SelectionDAG &DAG = DCI.DAG;
20370 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20371 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20372}
20373
20374static SDValue
20376 const AArch64Subtarget *Subtarget) {
20377 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20378 SelectionDAG &DAG = DCI.DAG;
20379 SDValue Vec = N->getOperand(0);
20380 SDValue Idx = N->getOperand(1);
20381
20382 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20383 return SDValue();
20384
20385 // Only legal for 8, 16, 32, and 64 bit element types.
20386 EVT EltVT = Vec.getValueType().getVectorElementType();
20387 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20388 MVT::bf16, MVT::f32, MVT::f64}),
20389 EltVT.getSimpleVT().SimpleTy))
20390 return SDValue();
20391
20392 SDValue Mask = Idx.getOperand(0);
20393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20394 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20395 return SDValue();
20396
20397 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20398 Vec);
20399}
20400
20401static SDValue
20403 const AArch64Subtarget *Subtarget) {
20404 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20405 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20406 return Res;
20407 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20408 return Res;
20409 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20410 return Res;
20411
20412 SelectionDAG &DAG = DCI.DAG;
20413 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20414
20415 EVT VT = N->getValueType(0);
20416 const bool FullFP16 = Subtarget->hasFullFP16();
20417 bool IsStrict = N0->isStrictFPOpcode();
20418
20419 // extract(dup x) -> x
20420 if (N0.getOpcode() == AArch64ISD::DUP)
20421 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20422 : N0.getOperand(0);
20423
20424 // Rewrite for pairwise fadd pattern
20425 // (f32 (extract_vector_elt
20426 // (fadd (vXf32 Other)
20427 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20428 // ->
20429 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20430 // (extract_vector_elt (vXf32 Other) 1)))
20431 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20432 // we can only do this when it's used only by the extract_vector_elt.
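  // For example (illustrative, assuming a v2f32 input), the rewritten
  //   (fadd (extract_vector_elt v, 0), (extract_vector_elt v, 1))
  // can then be selected as a single scalar pairwise add such as
  //   faddp s0, v0.2s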
20433 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20434 (!IsStrict || N0.hasOneUse())) {
20435 SDLoc DL(N0);
20436 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20437 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20438
20440 SDValue Other = N00;
20441
20442 // And handle the commutative case.
20443 if (!Shuffle) {
20444 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20445 Other = N01;
20446 }
20447
20448 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20449 Other == Shuffle->getOperand(0)) {
20450 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20451 DAG.getConstant(0, DL, MVT::i64));
20452 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20453 DAG.getConstant(1, DL, MVT::i64));
20454 if (!IsStrict)
20455 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20456
20457 // For strict_fadd we need uses of the final extract_vector to be replaced
20458 // with the strict_fadd, but we also need uses of the chain output of the
20459 // original strict_fadd to use the chain output of the new strict_fadd as
20460 // otherwise it may not be deleted.
20461 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20462 {VT, MVT::Other},
20463 {N0->getOperand(0), Extract1, Extract2});
20464 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20465 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20466 return SDValue(N, 0);
20467 }
20468 }
20469
20470 return SDValue();
20471}
20472
20475 SelectionDAG &DAG) {
20476 SDLoc DL(N);
20477 EVT VT = N->getValueType(0);
20478 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20479 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20480
20481 if (VT.isScalableVector())
20482 return SDValue();
20483
20484 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20485 N1Opc == ISD::TRUNCATE) {
20486 SDValue N00 = N0->getOperand(0);
20487 SDValue N10 = N1->getOperand(0);
20488 EVT N00VT = N00.getValueType();
20489 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20490
20491 // Optimize concat_vectors of truncated vectors, where the intermediate
20492 // type is illegal, to avoid said illegality, e.g.,
20493 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20494 // (v2i16 (truncate (v2i64)))))
20495 // ->
20496 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20497 // (v4i32 (bitcast (v2i64))),
20498 // <0, 2, 4, 6>)))
20499 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20500 // on both input and result type, so we might generate worse code.
20501 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20502 if (N00VT == N10.getValueType() &&
20503 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20504 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20505 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20507 for (size_t i = 0; i < Mask.size(); ++i)
20508 Mask[i] = i * 2;
20509 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20510 DAG.getVectorShuffle(
20511 MidVT, DL,
20512 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20513 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20514 }
20515
20516 // Optimize two large shifts and a combine into a single combine and shift
20517 // For AArch64 architectures, sequences like the following:
20518 //
20519 // ushr v0.4s, v0.4s, #20
20520 // ushr v1.4s, v1.4s, #20
20521 // uzp1 v0.8h, v0.8h, v1.8h
20522 //
20523 // Can be optimized to:
20524 //
20525 // uzp2 v0.8h, v0.8h, v1.8h
20526 // ushr v0.8h, v0.8h, #4
20527 //
20528 // This optimization reduces instruction count.
20529 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20530 N00->getOperand(1) == N10->getOperand(1)) {
20531 SDValue N000 = N00->getOperand(0);
20532 SDValue N100 = N10->getOperand(0);
20533 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20534 N101ConstVal = N10->getConstantOperandVal(1),
20535 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20536
20537 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20538 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20539 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20540 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20541 SDValue NewShiftConstant =
20542 DAG.getConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20543
20544 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20545 }
20546 }
20547 }
20548
20549 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20550 N->getOperand(0).getValueType() == MVT::v2i16 ||
20551 N->getOperand(0).getValueType() == MVT::v2i8) {
20552 EVT SrcVT = N->getOperand(0).getValueType();
20553 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20554 // loads to prevent having to go through the v4i8 load legalization that
20555 // needs to extend each element into a larger type.
20556 if (N->getNumOperands() % 2 == 0 &&
20557 all_of(N->op_values(), [SrcVT](SDValue V) {
20558 if (V.getValueType() != SrcVT)
20559 return false;
20560 if (V.isUndef())
20561 return true;
20562 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20563 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20564 LD->getExtensionType() == ISD::NON_EXTLOAD;
20565 })) {
20566 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20567 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20569
20570 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20571 SDValue V = N->getOperand(i);
20572 if (V.isUndef())
20573 Ops.push_back(DAG.getUNDEF(FVT));
20574 else {
20576 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20577 LD->getBasePtr(), LD->getMemOperand());
20578 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20579 Ops.push_back(NewLoad);
20580 }
20581 }
20582 return DAG.getBitcast(N->getValueType(0),
20583 DAG.getBuildVector(NVT, DL, Ops));
20584 }
20585 }
20586
20587 // Canonicalise concat_vectors to replace concatenations of truncated nots
20588 // with nots of concatenated truncates. This in some cases allows for multiple
20589 // redundant negations to be eliminated.
20590 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20591 // (v4i16 (truncate (not (v4i32)))))
20592 // ->
20593 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20594 // (v4i16 (truncate (v4i32)))))
20595 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20596 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20597 N->isOnlyUserOf(N1.getNode())) {
20598 auto isBitwiseVectorNegate = [](SDValue V) {
20599 return V->getOpcode() == ISD::XOR &&
20600 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20601 };
20602 SDValue N00 = N0->getOperand(0);
20603 SDValue N10 = N1->getOperand(0);
20604 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20605 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20606 return DAG.getNOT(
20607 DL,
20610 N00->getOperand(0)),
20612 N10->getOperand(0))),
20613 VT);
20614 }
20615 }
20616
20617 // Wait till after everything is legalized to try this. That way we have
20618 // legal vector types and such.
20619 if (DCI.isBeforeLegalizeOps())
20620 return SDValue();
20621
20622 // Optimise concat_vectors of two identical binops with a 128-bit destination
20623 // size, combining them into a binop of two concats of the source vectors. e.g.:
20624 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20625 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20626 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20627 isVectorizedBinOp(N0Opc)) &&
20628 N0->hasOneUse() && N1->hasOneUse()) {
20629 SDValue N00 = N0->getOperand(0);
20630 SDValue N01 = N0->getOperand(1);
20631 SDValue N10 = N1->getOperand(0);
20632 SDValue N11 = N1->getOperand(1);
20633
20634 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20635 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20636 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20637 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20638 }
20639 }
20640
20641 auto IsRSHRN = [](SDValue Shr) {
20642 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20643 return false;
20644 SDValue Op = Shr.getOperand(0);
20645 EVT VT = Op.getValueType();
20646 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20647 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20648 return false;
20649
20650 APInt Imm;
20651 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20652 Imm = APInt(VT.getScalarSizeInBits(),
20653 Op.getOperand(1).getConstantOperandVal(0)
20654 << Op.getOperand(1).getConstantOperandVal(1));
20655 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20656 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20657 Imm = APInt(VT.getScalarSizeInBits(),
20658 Op.getOperand(1).getConstantOperandVal(0));
20659 else
20660 return false;
20661
20662 if (Imm != 1ULL << (ShtAmt - 1))
20663 return false;
20664 return true;
20665 };
20666
20667 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
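  // Illustrative sketch (types and shift amount chosen for exposition): two
  // v4i16 operands each matching (vlshr (add x, (dup 1 << 4)), 5) are merged
  // so the rounding add and shift happen once on the concatenated value:
  //   (vlshr (add (concat x, y), (dup 1 << 4)), 5)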
20668 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20669 ((IsRSHRN(N1) &&
20671 N1.isUndef())) {
20672 SDValue X = N0.getOperand(0).getOperand(0);
20673 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20674 : N1.getOperand(0).getOperand(0);
20675 EVT BVT =
20676 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20677 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20678 SDValue Add = DAG.getNode(
20679 ISD::ADD, DL, BVT, CC,
20680 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20681 SDValue Shr =
20682 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20683 return Shr;
20684 }
20685
20686 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20687 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20688 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20689 N0.getOperand(1) == N1.getOperand(1)) {
20690 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20691 DAG.getUNDEF(N0.getValueType()));
20692 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20693 DAG.getUNDEF(N0.getValueType()));
20694 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20695 }
20696
20697 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20698 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20699 // canonicalise to that.
20700 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20701 assert(VT.getScalarSizeInBits() == 64);
20702 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20703 DAG.getConstant(0, DL, MVT::i64));
20704 }
20705
20706 // Canonicalise concat_vectors so that the right-hand vector has as few
20707 // bit-casts as possible before its real operation. The primary matching
20708 // destination for these operations will be the narrowing "2" instructions,
20709 // which depend on the operation being performed on this right-hand vector.
20710 // For example,
20711 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20712 // becomes
20713 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20714
20715 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20716 return SDValue();
20717 SDValue RHS = N1->getOperand(0);
20718 MVT RHSTy = RHS.getValueType().getSimpleVT();
20719 // If the RHS is not a vector, this is not the pattern we're looking for.
20720 if (!RHSTy.isVector())
20721 return SDValue();
20722
20723 LLVM_DEBUG(
20724 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20725
20726 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20727 RHSTy.getVectorNumElements() * 2);
20728 return DAG.getNode(ISD::BITCAST, DL, VT,
20729 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20730 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20731 RHS));
20732}
20733
20734static SDValue
20736 SelectionDAG &DAG) {
20737 if (DCI.isBeforeLegalizeOps())
20738 return SDValue();
20739
20740 EVT VT = N->getValueType(0);
20741 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20742 return SDValue();
20743
20744 SDValue V = N->getOperand(0);
20745
20746 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20747 // blocks this combine because the non-const case requires custom lowering.
20748 //
20749 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20750 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20751 if (isa<ConstantSDNode>(V.getOperand(0)))
20752 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20753
20754 return SDValue();
20755}
20756
20757static SDValue
20759 SelectionDAG &DAG) {
20760 SDLoc DL(N);
20761 SDValue Vec = N->getOperand(0);
20762 SDValue SubVec = N->getOperand(1);
20763 uint64_t IdxVal = N->getConstantOperandVal(2);
20764 EVT VecVT = Vec.getValueType();
20765 EVT SubVT = SubVec.getValueType();
20766
20767 // Promote fixed length vector zeros.
20768 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20769 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20770 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20771 : DAG.getConstantFP(0, DL, VecVT);
20772
20773 // Only do this for legal fixed vector types.
20774 if (!VecVT.isFixedLengthVector() ||
20775 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20776 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20777 return SDValue();
20778
20779 // Ignore widening patterns.
20780 if (IdxVal == 0 && Vec.isUndef())
20781 return SDValue();
20782
20783 // Subvector must be half the width and an "aligned" insertion.
20784 unsigned NumSubElts = SubVT.getVectorNumElements();
20785 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20786 (IdxVal != 0 && IdxVal != NumSubElts))
20787 return SDValue();
20788
20789 // Fold insert_subvector -> concat_vectors
20790 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20791 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20792 SDValue Lo, Hi;
20793 if (IdxVal == 0) {
20794 Lo = SubVec;
20795 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20796 DAG.getVectorIdxConstant(NumSubElts, DL));
20797 } else {
20798 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20799 DAG.getVectorIdxConstant(0, DL));
20800 Hi = SubVec;
20801 }
20802 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20803}
20804
20807 SelectionDAG &DAG) {
20808 // Wait until after everything is legalized to try this. That way we have
20809 // legal vector types and such.
20810 if (DCI.isBeforeLegalizeOps())
20811 return SDValue();
20812 // Transform a scalar conversion of a value from a lane extract into a
20813 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20814 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20815 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20816 //
20817 // The second form interacts better with instruction selection and the
20818 // register allocator to avoid cross-class register copies that aren't
20819 // coalescable due to a lane reference.
20820
20821 // Check the operand and see if it originates from a lane extract.
20822 SDValue Op1 = N->getOperand(1);
20824 return SDValue();
20825
20826 // Yep, no additional predication needed. Perform the transform.
20827 SDValue IID = N->getOperand(0);
20828 SDValue Shift = N->getOperand(2);
20829 SDValue Vec = Op1.getOperand(0);
20830 SDValue Lane = Op1.getOperand(1);
20831 EVT ResTy = N->getValueType(0);
20832 EVT VecResTy;
20833 SDLoc DL(N);
20834
20835 // The vector width should be 128 bits by the time we get here, even
20836 // if it started as 64 bits (the extract_vector handling will have
20837 // done so). Bail if it is not.
20838 if (Vec.getValueSizeInBits() != 128)
20839 return SDValue();
20840
20841 if (Vec.getValueType() == MVT::v4i32)
20842 VecResTy = MVT::v4f32;
20843 else if (Vec.getValueType() == MVT::v2i64)
20844 VecResTy = MVT::v2f64;
20845 else
20846 return SDValue();
20847
20848 SDValue Convert =
20849 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20851}
20852
20853// AArch64 high-vector "long" operations are formed by performing the non-high
20854// version on an extract_subvector of each operand which gets the high half:
20855//
20856// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20857//
20858// However, there are cases which don't have an extract_high explicitly, but
20859// have another operation that can be made compatible with one for free. For
20860// example:
20861//
20862// (dupv64 scalar) --> (extract_high (dup128 scalar))
20863//
20864// This routine does the actual conversion of such DUPs, once outer routines
20865// have determined that everything else is in order.
20866// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20867// similarly here.
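// For example (illustrative): a (v4i16 (dup w0)) feeding a long operation is
// rebuilt as (extract_subvector (v8i16 (dup w0)), 4), i.e. an extract_high of
// a 128-bit DUP, so the surrounding pattern can select the "2" form of the
// instruction (saddl2, umull2, ...).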
20869 MVT VT = N.getSimpleValueType();
20870 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20871 N.getConstantOperandVal(1) == 0)
20872 N = N.getOperand(0);
20873
20874 switch (N.getOpcode()) {
20875 case AArch64ISD::DUP:
20876 case AArch64ISD::DUPLANE8:
20877 case AArch64ISD::DUPLANE16:
20878 case AArch64ISD::DUPLANE32:
20879 case AArch64ISD::DUPLANE64:
20880 case AArch64ISD::MOVI:
20881 case AArch64ISD::MOVIshift:
20882 case AArch64ISD::MOVIedit:
20883 case AArch64ISD::MOVImsl:
20884 case AArch64ISD::MVNIshift:
20885 case AArch64ISD::MVNImsl:
20886 break;
20887 default:
20888 // FMOV could be supported, but isn't very useful, as it would only occur
20889 // if you passed a bitcast'd floating-point immediate to an eligible long
20890 // integer op (addl, smull, ...).
20891 return SDValue();
20892 }
20893
20894 if (!VT.is64BitVector())
20895 return SDValue();
20896
20897 SDLoc DL(N);
20898 unsigned NumElems = VT.getVectorNumElements();
20899 if (N.getValueType().is64BitVector()) {
20900 MVT ElementTy = VT.getVectorElementType();
20901 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20902 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20903 }
20904
20905 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20906 DAG.getConstant(NumElems, DL, MVT::i64));
20907}
20908
20910 if (N.getOpcode() == ISD::BITCAST)
20911 N = N.getOperand(0);
20912 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20913 return false;
20914 if (N.getOperand(0).getValueType().isScalableVector())
20915 return false;
20916 return N.getConstantOperandAPInt(1) ==
20917 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20918}
20919
20920/// Helper structure to keep track of ISD::SET_CC operands.
20926
20927/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20932
20933/// Helper structure to keep track of SetCC information.
20938
20939/// Helper structure to be able to read SetCC information. If the IsAArch64
20940/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20941/// GenericSetCCInfo.
20946
20947/// Check whether or not \p Op is a SET_CC operation, either a generic or
20948/// an
20949/// AArch64 lowered one.
20950/// \p SetCCInfo is filled accordingly.
20951/// \post SetCCInfo is meaningful only when this function returns true.
20952/// \return True when Op is a kind of SET_CC operation.
20954 // If this is a setcc, this is straightforward.
20955 if (Op.getOpcode() == ISD::SETCC) {
20956 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20957 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20958 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20959 SetCCInfo.IsAArch64 = false;
20960 return true;
20961 }
20962 // Otherwise, check if this is a matching csel instruction.
20963 // In other words:
20964 // - csel 1, 0, cc
20965 // - csel 0, 1, !cc
20966 if (Op.getOpcode() != AArch64ISD::CSEL)
20967 return false;
20968 // Set the information about the operands.
20969 // TODO: we want the operands of the Cmp not the csel
20970 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20971 SetCCInfo.IsAArch64 = true;
20972 SetCCInfo.Info.AArch64.CC =
20973 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20974
20975 // Check that the operands match the constraints:
20976 // (1) Both operands must be constants.
20977 // (2) One must be 1 and the other must be 0.
20978 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20979 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20980
20981 // Check (1).
20982 if (!TValue || !FValue)
20983 return false;
20984
20985 // Check (2).
20986 if (!TValue->isOne()) {
20987 // Update the comparison when we are interested in !cc.
20988 std::swap(TValue, FValue);
20989 SetCCInfo.Info.AArch64.CC =
20991 }
20992 return TValue->isOne() && FValue->isZero();
20993}
20994
20995// Returns true if Op is setcc or zext of setcc.
20997 if (isSetCC(Op, Info))
20998 return true;
20999 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21000 isSetCC(Op->getOperand(0), Info));
21001}
21002
21003// The folding we want to perform is:
21004// (add x, [zext] (setcc cc ...) )
21005// -->
21006// (csel x, (add x, 1), !cc ...)
21007//
21008// The latter will get matched to a CSINC instruction.
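// For example (illustrative):
//   add w0, w1, (setcc eq ...)   becomes   csinc w0, w1, w1, ne
// i.e. w1 is incremented only when the original condition holds.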
21010 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21011 SDValue LHS = Op->getOperand(0);
21012 SDValue RHS = Op->getOperand(1);
21013 SetCCInfoAndKind InfoAndKind;
21014
21015 // If both operands are a SET_CC, then we don't want to perform this
21016 // folding and create another csel as this results in more instructions
21017 // (and higher register usage).
21018 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21019 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21020 return SDValue();
21021
21022 // If neither operand is a SET_CC, give up.
21023 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21024 std::swap(LHS, RHS);
21025 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21026 return SDValue();
21027 }
21028
21029 // FIXME: This could be generalized to work for FP comparisons.
21030 EVT CmpVT = InfoAndKind.IsAArch64
21031 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21032 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21033 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21034 return SDValue();
21035
21036 SDValue CCVal;
21037 SDValue Cmp;
21038 SDLoc DL(Op);
21039 if (InfoAndKind.IsAArch64) {
21040 CCVal = DAG.getConstant(
21042 MVT::i32);
21043 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21044 } else
21045 Cmp = getAArch64Cmp(
21046 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21047 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21048 DL);
21049
21050 EVT VT = Op->getValueType(0);
21051 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21052 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21053}
21054
21055// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
21057 EVT VT = N->getValueType(0);
21058 // Only scalar integer and vector types.
21059 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21060 return SDValue();
21061
21062 SDValue LHS = N->getOperand(0);
21063 SDValue RHS = N->getOperand(1);
21064 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21065 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21066 return SDValue();
21067
21068 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21069 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21070 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21071 return SDValue();
21072
21073 SDValue Op1 = LHS->getOperand(0);
21074 SDValue Op2 = RHS->getOperand(0);
21075 EVT OpVT1 = Op1.getValueType();
21076 EVT OpVT2 = Op2.getValueType();
21077 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21078 Op2.getOpcode() != AArch64ISD::UADDV ||
21079 OpVT1.getVectorElementType() != VT)
21080 return SDValue();
21081
21082 SDValue Val1 = Op1.getOperand(0);
21083 SDValue Val2 = Op2.getOperand(0);
21084 EVT ValVT = Val1->getValueType(0);
21085 SDLoc DL(N);
21086 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21087 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21088 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21089 DAG.getConstant(0, DL, MVT::i64));
21090}
21091
21092/// Perform the scalar expression combine in the form of:
21093/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21094/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
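/// For example (illustrative constants): (CSEL #3, #1, lt) + b becomes
/// CSINC(b + 3, b, lt), roughly "add w8, w9, #3; csinc w0, w8, w9, lt",
/// avoiding a separate materialisation of the select result.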
21096 EVT VT = N->getValueType(0);
21097 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21098 return SDValue();
21099
21100 SDValue LHS = N->getOperand(0);
21101 SDValue RHS = N->getOperand(1);
21102
21103 // Handle commutativity.
21104 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21105 LHS.getOpcode() != AArch64ISD::CSNEG) {
21106 std::swap(LHS, RHS);
21107 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21108 LHS.getOpcode() != AArch64ISD::CSNEG) {
21109 return SDValue();
21110 }
21111 }
21112
21113 if (!LHS.hasOneUse())
21114 return SDValue();
21115
21117 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21118
21119 // The CSEL should include a constant one operand, and the CSNEG should
21120 // include a one or all-ones (negative one) operand.
21121 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21122 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21123 if (!CTVal || !CFVal)
21124 return SDValue();
21125
21126 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21127 (CTVal->isOne() || CFVal->isOne())) &&
21128 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21129 (CTVal->isOne() || CFVal->isAllOnes())))
21130 return SDValue();
21131
21132 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21133 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21134 !CFVal->isOne()) {
21135 std::swap(CTVal, CFVal);
21137 }
21138
21139 SDLoc DL(N);
21140 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21141 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21142 !CFVal->isAllOnes()) {
21143 APInt C = -1 * CFVal->getAPIntValue();
21144 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21145 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21147 }
21148
21149 // It might be neutral for larger constants, as the immediate needs to be
21150 // materialized in a register.
21151 APInt ADDC = CTVal->getAPIntValue();
21152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21153 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21154 return SDValue();
21155
21156 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21157 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21158 "Unexpected constant value");
21159
21160 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21161 SDValue CCVal = getCondCode(DAG, AArch64CC);
21162 SDValue Cmp = LHS.getOperand(3);
21163
21164 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21165}
21166
21167// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21169 EVT VT = N->getValueType(0);
21170 if (N->getOpcode() != ISD::ADD)
21171 return SDValue();
21172
21173 SDValue Dot = N->getOperand(0);
21174 SDValue A = N->getOperand(1);
21175 // Handle commutativity
21176 auto isZeroDot = [](SDValue Dot) {
21177 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21178 Dot.getOpcode() == AArch64ISD::SDOT) &&
21180 };
21181 if (!isZeroDot(Dot))
21182 std::swap(Dot, A);
21183 if (!isZeroDot(Dot))
21184 return SDValue();
21185
21186 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21187 Dot.getOperand(2));
21188}
21189
21191 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21192}
21193
21194// Try to fold
21195//
21196// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21197//
21198// The folding helps csel to be matched with csneg without generating a
21199// redundant neg instruction, which includes the negation of the csel
21200// expansion of an abs node lowered by lowerABS.
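// For example (illustrative): for an abs lowered as neg(csel(x, sub(0, x))),
// pushing the negation into both csel arms leaves each arm as either x or its
// negation, so the whole expression can be selected as csneg rather than a
// csel followed by a separate neg.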
21202 if (!isNegatedInteger(SDValue(N, 0)))
21203 return SDValue();
21204
21205 SDValue CSel = N->getOperand(1);
21206 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21207 return SDValue();
21208
21209 SDValue N0 = CSel.getOperand(0);
21210 SDValue N1 = CSel.getOperand(1);
21211
21212 // If neither of them are negations, it's not worth the folding as it
21213 // introduces two additional negations while reducing one negation.
21214 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21215 return SDValue();
21216
21217 SDLoc DL(N);
21218 EVT VT = CSel.getValueType();
21219
21220 SDValue N0N = DAG.getNegative(N0, DL, VT);
21221 SDValue N1N = DAG.getNegative(N1, DL, VT);
21222
21223 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21224 CSel.getOperand(3));
21225}
21226
21227// The basic add/sub long vector instructions have variants with "2" on the end
21228// which act on the high-half of their inputs. They are normally matched by
21229// patterns like:
21230//
21231// (add (zeroext (extract_high LHS)),
21232// (zeroext (extract_high RHS)))
21233// -> uaddl2 vD, vN, vM
21234//
21235// However, if one of the extracts is something like a duplicate, this
21236// instruction can still be used profitably. This function puts the DAG into a
21237// more appropriate form for those patterns to trigger.
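// For example (illustrative):
//   (add (zext (extract_high (v8i16 x))), (zext (v4i16 (dup w0))))
// is rewritten so the dup side also becomes an extract_high of a 128-bit dup,
// letting the whole expression select to a single uaddl2.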
21240 SelectionDAG &DAG = DCI.DAG;
21241 if (DCI.isBeforeLegalizeOps())
21242 return SDValue();
21243
21244 MVT VT = N->getSimpleValueType(0);
21245 if (!VT.is128BitVector()) {
21246 if (N->getOpcode() == ISD::ADD)
21247 return performSetccAddFolding(N, DAG);
21248 return SDValue();
21249 }
21250
21251 // Make sure both branches are extended in the same way.
21252 SDValue LHS = N->getOperand(0);
21253 SDValue RHS = N->getOperand(1);
21254 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21255 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21256 LHS.getOpcode() != RHS.getOpcode())
21257 return SDValue();
21258
21259 unsigned ExtType = LHS.getOpcode();
21260
21261 // It's not worth doing if at least one of the inputs isn't already an
21262 // extract, but we don't know which it'll be so we have to try both.
21263 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21264 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21265 if (!RHS.getNode())
21266 return SDValue();
21267
21268 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21269 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21270 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21271 if (!LHS.getNode())
21272 return SDValue();
21273
21274 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21275 }
21276
21277 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21278}
21279
21280static bool isCMP(SDValue Op) {
21281 return Op.getOpcode() == AArch64ISD::SUBS &&
21282 !Op.getNode()->hasAnyUseOfValue(0);
21283}
21284
21285// (CSEL 1 0 CC Cond) => CC
21286// (CSEL 0 1 CC Cond) => !CC
21287static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21288 if (Op.getOpcode() != AArch64ISD::CSEL)
21289 return std::nullopt;
21290 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21291 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21292 return std::nullopt;
21293 SDValue OpLHS = Op.getOperand(0);
21294 SDValue OpRHS = Op.getOperand(1);
21295 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21296 return CC;
21297 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21298 return getInvertedCondCode(CC);
21299
21300 return std::nullopt;
21301}
21302
21303// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21304// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21305static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21306 SDValue CmpOp = Op->getOperand(2);
21307 if (!isCMP(CmpOp))
21308 return SDValue();
21309
21310 if (IsAdd) {
21311 if (!isOneConstant(CmpOp.getOperand(1)))
21312 return SDValue();
21313 } else {
21314 if (!isNullConstant(CmpOp.getOperand(0)))
21315 return SDValue();
21316 }
21317
21318 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21319 auto CC = getCSETCondCode(CsetOp);
21320 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21321 return SDValue();
21322
21323 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21324 Op->getOperand(0), Op->getOperand(1),
21325 CsetOp.getOperand(3));
21326}
21327
21328// (ADC x 0 cond) => (CINC x HS cond)
21330 SDValue LHS = N->getOperand(0);
21331 SDValue RHS = N->getOperand(1);
21332 SDValue Cond = N->getOperand(2);
21333
21334 if (!isNullConstant(RHS))
21335 return SDValue();
21336
21337 EVT VT = N->getValueType(0);
21338 SDLoc DL(N);
21339
21340 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21342 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21343}
21344
21347 SelectionDAG &DAG) {
21348 SDLoc DL(N);
21349 EVT VT = N->getValueType(0);
21350
21352 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21353 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21354 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21355 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21356 Elt1->getOpcode() == ISD::FP_ROUND &&
21357 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21358 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21359 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21361 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21362 // Constant index.
21364 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21365 Elt0->getOperand(0)->getOperand(0) ==
21366 Elt1->getOperand(0)->getOperand(0) &&
21367 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21368 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21369 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21370 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21371 SDValue HighLanes;
21372 if (Elt2->getOpcode() == ISD::UNDEF &&
21373 Elt3->getOpcode() == ISD::UNDEF) {
21374 HighLanes = DAG.getUNDEF(MVT::v2f32);
21375 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21376 Elt3->getOpcode() == ISD::FP_ROUND &&
21377 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21378 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21379 Elt2->getConstantOperandVal(1) ==
21380 Elt3->getConstantOperandVal(1) &&
21381 Elt2->getOperand(0)->getOpcode() ==
21383 Elt3->getOperand(0)->getOpcode() ==
21385 // Constant index.
21386 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21387 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21388 Elt2->getOperand(0)->getOperand(0) ==
21389 Elt3->getOperand(0)->getOperand(0) &&
21390 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21391 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21392 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21393 HighLanes =
21394 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21395 }
21396 if (HighLanes) {
21397 SDValue DoubleToSingleSticky =
21398 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21399 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21400 DoubleToSingleSticky, HighLanes);
21401 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21402 Elt0->getOperand(1));
21403 }
21404 }
21405 }
21406 }
21407
21408 if (VT == MVT::v2f64) {
21409 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21410 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21411 Elt1->getOpcode() == ISD::FP_EXTEND &&
21413 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21414 Elt0->getOperand(0)->getOperand(0) ==
21415 Elt1->getOperand(0)->getOperand(0) &&
21416 // Constant index.
21418 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21419 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21420 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21421 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21422 // ResultType's known minimum vector length.
21423 Elt0->getOperand(0)->getConstantOperandVal(1) %
21425 0) {
21426 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21427 if (SrcVec.getValueType() == MVT::v4f16 ||
21428 SrcVec.getValueType() == MVT::v4bf16) {
21429 SDValue HalfToSingle =
21430 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21431 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21432 SDValue Extract = DAG.getNode(
21434 HalfToSingle, SubvectorIdx);
21435 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21436 }
21437 }
21438 }
21439
21440 // A build vector of two extracted elements is equivalent to an
21441 // extract subvector where the inner vector is any-extended to the
21442 // extract_vector_elt VT.
21443 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21444 // (extract_elt_iXX_to_i32 vec Idx+1))
21445 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21446
21447 // For now, only consider the v2i32 case, which arises as a result of
21448 // legalization.
21449 if (VT != MVT::v2i32)
21450 return SDValue();
21451
21452 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21453 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21454 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21455 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21456 // Constant index.
21457 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21458 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21459 // Both EXTRACT_VECTOR_ELT from same vector...
21460 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21461 // ... and contiguous. First element's index +1 == second element's index.
21462 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21463 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21464 // ResultType's known minimum vector length.
21465 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21466 SDValue VecToExtend = Elt0->getOperand(0);
21467 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21468 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21469 return SDValue();
21470
21471 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21472
21473 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21474 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21475 SubvectorIdx);
21476 }
21477
21478 return SDValue();
21479}
21480
21481// A special combine for the sqdmulh family of instructions.
21482// smin( sra( mul( sext v0, sext v1 ), SHIFT_AMOUNT ),
21483// SATURATING_VAL ) can be reduced to sqdmulh(...)
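// For example (illustrative, i16 lanes):
//   smin(sra(mul(sext x, sext y), 15), 32767)
// computes (x * y) >> 15 with the positive overflow case clamped, i.e. the
// (sign-extended) result of "sqdmulh v0.4h, v0.4h, v1.4h" for v4i16 inputs.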
21485
21486 if (N->getOpcode() != ISD::SMIN)
21487 return SDValue();
21488
21489 EVT DestVT = N->getValueType(0);
21490
21491 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21492 DestVT.isScalableVector())
21493 return SDValue();
21494
21495 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21496
21497 if (!Clamp)
21498 return SDValue();
21499
21500 MVT ScalarType;
21501 unsigned ShiftAmt = 0;
21502 switch (Clamp->getSExtValue()) {
21503 case (1ULL << 15) - 1:
21504 ScalarType = MVT::i16;
21505 ShiftAmt = 16;
21506 break;
21507 case (1ULL << 31) - 1:
21508 ScalarType = MVT::i32;
21509 ShiftAmt = 32;
21510 break;
21511 default:
21512 return SDValue();
21513 }
21514
21515 SDValue Sra = N->getOperand(0);
21516 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21517 return SDValue();
21518
21519 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21520 if (!RightShiftVec)
21521 return SDValue();
21522 unsigned SExtValue = RightShiftVec->getSExtValue();
21523
21524 if (SExtValue != (ShiftAmt - 1))
21525 return SDValue();
21526
21527 SDValue Mul = Sra.getOperand(0);
21528 if (Mul.getOpcode() != ISD::MUL)
21529 return SDValue();
21530
21531 SDValue SExt0 = Mul.getOperand(0);
21532 SDValue SExt1 = Mul.getOperand(1);
21533
21534 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21535 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21536 return SDValue();
21537
21538 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21539 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21540
21541 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21542 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21543 SExt0Type.getVectorNumElements() == 1)
21544 return SDValue();
21545
21546 SDLoc DL(N);
21547 SDValue V0 = SExt0.getOperand(0);
21548 SDValue V1 = SExt1.getOperand(0);
21549
21550 // Ensure input vectors are extended to legal types
21551 if (SExt0Type.getFixedSizeInBits() < 64) {
21552 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21553 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21554 VecNumElements);
21555 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21556 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21557 }
21558
21559 SDValue SQDMULH =
21560 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21561
21562 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21563}
21564
21566 if (SDValue V = trySQDMULHCombine(N, DAG)) {
21567 return V;
21568 }
21569
21570 return SDValue();
21571}
21572
21575 SDLoc DL(N);
21576 EVT VT = N->getValueType(0);
21577 SDValue N0 = N->getOperand(0);
21578 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21579 N0.getOpcode() == AArch64ISD::DUP) {
21580 SDValue Op = N0.getOperand(0);
21581 if (VT.getScalarType() == MVT::i32 &&
21582 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21583 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21584 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21585 }
21586
21587 // Performing the following combine produces a preferable form for ISEL.
21588 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
21590 N0.hasOneUse()) {
21591 SDValue Op = N0.getOperand(0);
21592 SDValue ExtractIndexNode = N0.getOperand(1);
21593 if (!isa<ConstantSDNode>(ExtractIndexNode))
21594 return SDValue();
21595
21596 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21597 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21598 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21599 "Unexpected legalisation result!");
21600
21601 EVT SrcVectorType = Op.getValueType();
21602 // We also assume that SrcVectorType cannot be a V64 (see
21603 // LowerEXTRACT_VECTOR_ELT).
21604 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21605 "Unexpected legalisation result!");
21606
21607 unsigned ExtractIndex =
21608 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21609 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21610
21611 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21612 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21613 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21614 }
21615
21616 return SDValue();
21617}
21618
21619// Check whether a node is an extend or shift operand
21621 unsigned Opcode = N.getOpcode();
21622 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21623 EVT SrcVT;
21624 if (Opcode == ISD::SIGN_EXTEND_INREG)
21625 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21626 else
21627 SrcVT = N.getOperand(0).getValueType();
21628
21629 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21630 } else if (Opcode == ISD::AND) {
21631 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21632 if (!CSD)
21633 return false;
21634 uint64_t AndMask = CSD->getZExtValue();
21635 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21636 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21637 return isa<ConstantSDNode>(N.getOperand(1));
21638 }
21639
21640 return false;
21641}
21642
21643// (N - Y) + Z --> (Z - Y) + N
21644// when N is an extend or shift operand
21646 SelectionDAG &DAG) {
21647 auto IsOneUseExtend = [](SDValue N) {
21648 return N.hasOneUse() && isExtendOrShiftOperand(N);
21649 };
21650
21651 // DAGCombiner will revert the combination when Z is a constant, causing an
21652 // infinite loop. So don't enable the combination when Z is a constant.
21653 // If Z is a one-use extend or shift, we also can't do the optimization,
21654 // as it would likewise fall into an infinite loop.
21655 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21656 return SDValue();
21657
21658 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21659 return SDValue();
21660
21661 SDValue Shift = SUB.getOperand(0);
21662 if (!IsOneUseExtend(Shift))
21663 return SDValue();
21664
21665 SDLoc DL(N);
21666 EVT VT = N->getValueType(0);
21667
21668 SDValue Y = SUB.getOperand(1);
21669 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21670 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21671}
21672
21674 SelectionDAG &DAG) {
21675 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21676 // commutative.
21677 if (N->getOpcode() != ISD::ADD)
21678 return SDValue();
21679
21680 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21681 // shifted register is only available for i32 and i64.
21682 EVT VT = N->getValueType(0);
21683 if (VT != MVT::i32 && VT != MVT::i64)
21684 return SDValue();
21685
21686 SDLoc DL(N);
21687 SDValue LHS = N->getOperand(0);
21688 SDValue RHS = N->getOperand(1);
21689
21690 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21691 return Val;
21692 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21693 return Val;
21694
21695 uint64_t LHSImm = 0, RHSImm = 0;
21696 // If both operands are shifted by an immediate and the shift amount is not
21697 // greater than 4 for one of them, swap LHS and RHS to put the operand with
21698 // the smaller shift amount on the RHS.
21699 //
21700 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
21701 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21702 // with LSL (shift > 4). For the remaining processors, this is a no-op for
21703 // performance or correctness.
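  // For example (illustrative): add(shl(x1, 2), shl(x2, 7)) is rewritten as
  // add(shl(x2, 7), shl(x1, 2)) so that the cheap "lsl #2" is the one folded
  // into the ADD's shifted-register operand, e.g.
  //   lsl x8, x2, #7
  //   add x0, x8, x1, lsl #2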
21704 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21705 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21706 RHSImm > 4 && LHS.hasOneUse())
21707 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21708
21709 return SDValue();
21710}
21711
21712// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
21713// This reassociates it back to allow the creation of more mls instructions.
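// For example (illustrative): sub(x, add(mul(a, b), mul(c, d))) is rewritten
// as sub(sub(x, mul(a, b)), mul(c, d)), which can then be selected as two
// mls (multiply-subtract) instructions accumulating into x.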
21715 if (N->getOpcode() != ISD::SUB)
21716 return SDValue();
21717
21718 SDValue Add = N->getOperand(1);
21719 SDValue X = N->getOperand(0);
21720 if (Add.getOpcode() != ISD::ADD)
21721 return SDValue();
21722
21723 if (!Add.hasOneUse())
21724 return SDValue();
21726 return SDValue();
21727
21728 SDValue M1 = Add.getOperand(0);
21729 SDValue M2 = Add.getOperand(1);
21730 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21731 M1.getOpcode() != AArch64ISD::UMULL)
21732 return SDValue();
21733 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21734 M2.getOpcode() != AArch64ISD::UMULL)
21735 return SDValue();
21736
21737 EVT VT = N->getValueType(0);
21738 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21739 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21740}
21741
21742// Combine into mla/mls.
21743// This works on the patterns of:
21744// add v1, (mul v2, v3)
21745// sub v1, (mul v2, v3)
21746// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21747// It will transform the add/sub to a scalable version, so that we can
21748// make use of SVE's MLA/MLS that will be generated for that pattern
21749static SDValue
21751 SelectionDAG &DAG = DCI.DAG;
21752 // Make sure that the types are legal
21753 if (!DCI.isAfterLegalizeDAG())
21754 return SDValue();
21755 // Before using SVE's features, check first if it's available.
21756 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21757 return SDValue();
21758
21759 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21760 return SDValue();
21761
21762 if (!N->getValueType(0).isFixedLengthVector())
21763 return SDValue();
21764
21765 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21766 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21767 return SDValue();
21768
21769 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21770 return SDValue();
21771
21772 SDValue MulValue = Op1->getOperand(0);
21773 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21774 return SDValue();
21775
21776 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21777 return SDValue();
21778
21779 EVT ScalableVT = MulValue.getValueType();
21780 if (!ScalableVT.isScalableVector())
21781 return SDValue();
21782
21783 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21784 SDValue NewValue =
21785 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21786 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21787 };
21788
21789 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21790 return res;
21791 else if (N->getOpcode() == ISD::ADD)
21792 return performOpt(N->getOperand(1), N->getOperand(0));
21793
21794 return SDValue();
21795}
21796
21797// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21798// help, for example, to produce ssra from sshr+add.
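// For example (illustrative): add(i64 (extract_elt (sshr (v1i64 X), #7), 0), L)
// where L is an i64 load becomes
//   extract_elt(add(v1i64 (sshr X, #7), scalar_to_vector(L)), 0)
// which can be selected as "ssra d0, d1, #7" rather than a scalar shift + add.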
21800 EVT VT = N->getValueType(0);
21801 if (VT != MVT::i64 ||
21802 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21803 return SDValue();
21804 SDValue Op0 = N->getOperand(0);
21805 SDValue Op1 = N->getOperand(1);
21806
21807 // At least one of the operands should be an extract, and the other should be
21808 // something that is easy to convert to v1i64 type (in this case a load).
21809 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21810 Op0.getOpcode() != ISD::LOAD)
21811 return SDValue();
21812 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21813 Op1.getOpcode() != ISD::LOAD)
21814 return SDValue();
21815
21816 SDLoc DL(N);
21817 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21818 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21819 Op0 = Op0.getOperand(0);
21820 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21821 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21822 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21823 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21824 Op1 = Op1.getOperand(0);
21825 } else
21826 return SDValue();
21827
21828 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21829 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21830 DAG.getConstant(0, DL, MVT::i64));
21831}
21832
21835 if (!BV->hasOneUse())
21836 return false;
21837 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21838 if (!Ld || !Ld->isSimple())
21839 return false;
21840 Loads.push_back(Ld);
21841 return true;
21842 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21843 BV.getOpcode() == ISD::CONCAT_VECTORS) {
21844 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21845 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21846 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21847 return false;
21848 Loads.push_back(Ld);
21849 }
21850 return true;
21851 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21852 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21853 // are lowered. Note that this only comes up because we do not always visit
21854 // operands before uses. After that is fixed this can be removed and in the
21855 // meantime this is fairly specific to the lowering we expect from IR.
21856 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21857 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21858 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21859 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21860 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21861 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21862 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21863 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21864 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21865 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21866 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21867 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21868 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21869 B.getOperand(1).getNumOperands() != 4)
21870 return false;
21871 auto SV1 = cast<ShuffleVectorSDNode>(B);
21872 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21873 int NumElts = B.getValueType().getVectorNumElements();
21874 int NumSubElts = NumElts / 4;
21875 for (int I = 0; I < NumSubElts; I++) {
21876 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21877 if (SV1->getMaskElt(I) != I ||
21878 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21879 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21880 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21881 return false;
21882 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21883 if (SV2->getMaskElt(I) != I ||
21884 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21885 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21886 return false;
21887 }
21888 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21889 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21890 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21891 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21892 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21893 !Ld2->isSimple() || !Ld3->isSimple())
21894 return false;
21895 Loads.push_back(Ld0);
21896 Loads.push_back(Ld1);
21897 Loads.push_back(Ld2);
21898 Loads.push_back(Ld3);
21899 return true;
21900 }
21901 return false;
21902}
21903
21904static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21905 SelectionDAG &DAG,
21906 unsigned &NumSubLoads) {
21907 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21908 return false;
21909
21910 SmallVector<LoadSDNode *> Loads0, Loads1;
21911 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21912 isLoadOrMultipleLoads(Op1, Loads1)) {
21913 if (NumSubLoads && Loads0.size() != NumSubLoads)
21914 return false;
21915 NumSubLoads = Loads0.size();
21916 return Loads0.size() == Loads1.size() &&
21917 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21918 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21919 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21920 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21921 Size / 8, 1);
21922 });
21923 }
21924
21925 if (Op0.getOpcode() != Op1.getOpcode())
21926 return false;
21927
21928 switch (Op0.getOpcode()) {
21929 case ISD::ADD:
21930 case ISD::SUB:
21931 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21932 DAG, NumSubLoads) &&
21933 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
21934 DAG, NumSubLoads);
21935 case ISD::SIGN_EXTEND:
21936 case ISD::ANY_EXTEND:
21937 case ISD::ZERO_EXTEND:
21938 EVT XVT = Op0.getOperand(0).getValueType();
21939 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21940 XVT.getScalarSizeInBits() != 32)
21941 return false;
21942 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21943 DAG, NumSubLoads);
21944 }
21945 return false;
21946}
21947
21948// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
21950// into a single load of twice the size, from which we extract the bottom and
21951// top parts so that the shl can use a shll2 instruction. The two loads in that
21951// example can also be larger trees of instructions, which are identical except
21952// for the leaves which are all loads offset from the LHS, including
21953// buildvectors of multiple loads. For example the RHS tree could be
21954// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
21955// Whilst it can be common for the larger loads to replace LDP instructions
21956// (which doesn't gain anything on its own), the larger loads can help create
21957// more efficient code, and in buildvectors prevent the need for ld1 lane
21958// inserts which can be slower than normal loads.
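// As a sketch of the transform (types elided):
//   add(zext(load p), shl(zext(load p+4), C))
// becomes a single double-width load of p, from which the low and high halves
// are extracted via shuffles, extended and recombined as add(lo, shl(hi, C)).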
21959static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21960 EVT VT = N->getValueType(0);
21961 if (!VT.isFixedLengthVector() ||
21962 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21963 VT.getScalarSizeInBits() != 64))
21964 return SDValue();
21965
21966 SDValue Other = N->getOperand(0);
21967 SDValue Shift = N->getOperand(1);
21968 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21969 std::swap(Shift, Other);
21970 APInt ShiftAmt;
21971 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21972 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21973 return SDValue();
21974
21975 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21976 !ISD::isExtOpcode(Other.getOpcode()) ||
21977 Shift.getOperand(0).getOperand(0).getValueType() !=
21978 Other.getOperand(0).getValueType() ||
21979 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21980 return SDValue();
21981
21982 SDValue Op0 = Other.getOperand(0);
21983 SDValue Op1 = Shift.getOperand(0).getOperand(0);
21984
21985 unsigned NumSubLoads = 0;
21986 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21987 return SDValue();
21988
21989 // Attempt to rule out some unprofitable cases using heuristics (some working
21990 // around suboptimal code generation), notably if the extend would not be able
21991 // to use ushll2 instructions as the types are not large enough. Otherwise zips
21992 // will need to be created, which can increase the instruction count.
21993 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21994 unsigned NumSubElts = NumElts / NumSubLoads;
21995 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21996 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21997 Op0.getValueType().getSizeInBits() < 128 &&
21999 return SDValue();
22000
22001 // Recreate the tree with the new combined loads.
22002 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22003 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22004 EVT DVT =
22005 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
22006
22007 SmallVector<LoadSDNode *> Loads0, Loads1;
22008 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22009 isLoadOrMultipleLoads(Op1, Loads1)) {
22010 EVT LoadVT = EVT::getVectorVT(
22011 *DAG.getContext(), Op0.getValueType().getScalarType(),
22012 Op0.getValueType().getVectorNumElements() / Loads0.size());
22013 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22014
22015 SmallVector<SDValue> NewLoads;
22016 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22017 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22018 L0->getBasePtr(), L0->getPointerInfo(),
22019 L0->getBaseAlign());
22020 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22021 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22022 NewLoads.push_back(Load);
22023 }
22024 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22025 }
22026
22027 SmallVector<SDValue> Ops;
22028 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22029 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22030 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22031 };
22032 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22033
22034 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22035 int Hi = NumSubElts, Lo = 0;
22036 for (unsigned i = 0; i < NumSubLoads; i++) {
22037 for (unsigned j = 0; j < NumSubElts; j++) {
22038 LowMask[i * NumSubElts + j] = Lo++;
22039 HighMask[i * NumSubElts + j] = Hi++;
22040 }
22041 Lo += NumSubElts;
22042 Hi += NumSubElts;
22043 }
22044 SDLoc DL(N);
22045 SDValue Ext0, Ext1;
22046 // Extract the top and bottom lanes, then extend the result. Possibly extend
22047 // the result then extract the lanes if the two operands match as it produces
22048 // slightly smaller code.
22049 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22050 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
22051 NewOp, DAG.getConstant(0, DL, MVT::i64));
22052 SDValue SubH =
22053 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22054 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22055 SDValue Extr0 =
22056 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22057 SDValue Extr1 =
22058 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22059 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22060 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22061 } else {
22062 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
22063 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22064 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22065 DAG.getConstant(0, DL, MVT::i64));
22066 SDValue SubH =
22067 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22068 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22069 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22070 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22071 }
22072 SDValue NShift =
22073 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22074 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22075}
22076
22077static SDValue performAddSubCombine(SDNode *N,
22078 TargetLowering::DAGCombinerInfo &DCI) {
22079 // Try to change sum of two reductions.
22080 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22081 return Val;
22082 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22083 return Val;
22084 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22085 return Val;
22086 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22087 return Val;
22088 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22089 return Val;
22090 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
22091 return Val;
22092 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22093 return Val;
22094 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22095 return Val;
22096 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22097 return Val;
22098
22099 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22100 return Val;
22101
22102 return performAddSubLongCombine(N, DCI);
22103}
22104
22105// Massage DAGs which we can use the high-half "long" operations on into
22106// something isel will recognize better. E.g.
22107//
22108// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22109// (aarch64_neon_umull (extract_high (v2i64 vec)))
22110// (extract_high (v2i64 (dup128 scalar)))))
22111//
22112static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
22113 TargetLowering::DAGCombinerInfo &DCI,
22114 SelectionDAG &DAG) {
22115 if (DCI.isBeforeLegalizeOps())
22116 return SDValue();
22117
22118 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22119 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22120 assert(LHS.getValueType().is64BitVector() &&
22121 RHS.getValueType().is64BitVector() &&
22122 "unexpected shape for long operation");
22123
22124 // Either node could be a DUP, but it's not worth doing both of them (you'd
22125 // just as well use the non-high version) so look for a corresponding extract
22126 // operation on the other "wing".
22127 if (isEssentiallyExtractHighSubvector(LHS)) {
22128 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
22129 if (!RHS.getNode())
22130 return SDValue();
22131 } else if (isEssentiallyExtractHighSubvector(RHS)) {
22132 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
22133 if (!LHS.getNode())
22134 return SDValue();
22135 } else
22136 return SDValue();
22137
22138 if (IID == Intrinsic::not_intrinsic)
22139 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22140
22141 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22142 N->getOperand(0), LHS, RHS);
22143}
22144
22145static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22146 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22147 unsigned ElemBits = ElemTy.getSizeInBits();
22148
22149 int64_t ShiftAmount;
22150 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22151 APInt SplatValue, SplatUndef;
22152 unsigned SplatBitSize;
22153 bool HasAnyUndefs;
22154 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22155 HasAnyUndefs, ElemBits) ||
22156 SplatBitSize != ElemBits)
22157 return SDValue();
22158
22159 ShiftAmount = SplatValue.getSExtValue();
22160 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22161 ShiftAmount = CVN->getSExtValue();
22162 } else
22163 return SDValue();
22164
22165 // If the shift amount is zero, remove the shift intrinsic.
22166 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22167 return N->getOperand(1);
22168
22169 unsigned Opcode;
22170 bool IsRightShift;
22171 switch (IID) {
22172 default:
22173 llvm_unreachable("Unknown shift intrinsic");
22174 case Intrinsic::aarch64_neon_sqshl:
22175 Opcode = AArch64ISD::SQSHL_I;
22176 IsRightShift = false;
22177 break;
22178 case Intrinsic::aarch64_neon_uqshl:
22179 Opcode = AArch64ISD::UQSHL_I;
22180 IsRightShift = false;
22181 break;
22182 case Intrinsic::aarch64_neon_srshl:
22183 Opcode = AArch64ISD::SRSHR_I;
22184 IsRightShift = true;
22185 break;
22186 case Intrinsic::aarch64_neon_urshl:
22187 Opcode = AArch64ISD::URSHR_I;
22188 IsRightShift = true;
22189 break;
22190 case Intrinsic::aarch64_neon_sqshlu:
22191 Opcode = AArch64ISD::SQSHLU_I;
22192 IsRightShift = false;
22193 break;
22194 case Intrinsic::aarch64_neon_sshl:
22195 case Intrinsic::aarch64_neon_ushl:
22196 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22197 // left shift for positive shift amounts. For negative shifts we can use a
22198 // VASHR/VLSHR as appropriate.
22199 if (ShiftAmount < 0) {
22200 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22201 : AArch64ISD::VLSHR;
22202 ShiftAmount = -ShiftAmount;
22203 } else
22204 Opcode = AArch64ISD::VSHL;
22205 IsRightShift = false;
22206 break;
22207 }
22208
22209 EVT VT = N->getValueType(0);
22210 SDValue Op = N->getOperand(1);
22211 SDLoc DL(N);
22212 if (VT == MVT::i64) {
22213 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22214 VT = MVT::v1i64;
22215 }
22216
22217 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22218 Op = DAG.getNode(Opcode, DL, VT, Op,
22219 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32));
22220 if (N->getValueType(0) == MVT::i64)
22221 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22222 DAG.getConstant(0, DL, MVT::i64));
22223 return Op;
22224 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22225 Op = DAG.getNode(Opcode, DL, VT, Op,
22226 DAG.getConstant(ShiftAmount, DL, MVT::i32));
22227 if (N->getValueType(0) == MVT::i64)
22228 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22229 DAG.getConstant(0, DL, MVT::i64));
22230 return Op;
22231 }
22232
22233 return SDValue();
22234}
22235
22236// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22237// the intrinsics must be legal and take an i32, this means there's almost
22238// certainly going to be a zext in the DAG which we can eliminate.
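// For example, crc32b(w0, and(w1, 0xff)) can become crc32b(w0, w1), because
// CRC32B only reads the low 8 bits of its data operand.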
22239static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22240 SDValue AndN = N->getOperand(2);
22241 if (AndN.getOpcode() != ISD::AND)
22242 return SDValue();
22243
22244 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
22245 if (!CMask || CMask->getZExtValue() != Mask)
22246 return SDValue();
22247
22248 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22249 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22250}
22251
22252static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
22253 SelectionDAG &DAG) {
22254 SDLoc DL(N);
22255 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22256 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22257 N->getOperand(1)),
22258 DAG.getConstant(0, DL, MVT::i64));
22259}
22260
22261static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
22262 SDLoc DL(N);
22263 SDValue Op1 = N->getOperand(1);
22264 SDValue Op2 = N->getOperand(2);
22265 EVT ScalarTy = Op2.getValueType();
22266 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22267 ScalarTy = MVT::i32;
22268
22269 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
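 // For example, index(2, 3) for nxv4i32 produces <2, 5, 8, 11, ...>, i.e.
 // splat(2) + <0, 1, 2, 3, ...> * splat(3).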
22270 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22271 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22272 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22273 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22274 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22275}
22276
22277static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
22278 SDLoc DL(N);
22279 SDValue Scalar = N->getOperand(3);
22280 EVT ScalarTy = Scalar.getValueType();
22281
22282 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22283 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
22284
22285 SDValue Passthru = N->getOperand(1);
22286 SDValue Pred = N->getOperand(2);
22287 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22288 Pred, Scalar, Passthru);
22289}
22290
22291static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
22292 SDLoc DL(N);
22293 LLVMContext &Ctx = *DAG.getContext();
22294 EVT VT = N->getValueType(0);
22295
22296 assert(VT.isScalableVector() && "Expected a scalable vector.");
22297
22298 // Current lowering only supports the SVE-ACLE types.
22299 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
22300 return SDValue();
22301
22302 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22303 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22304 EVT ByteVT =
22305 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22306
22307 // Convert everything to the domain of EXT (i.e. bytes).
22308 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22309 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22310 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22311 DAG.getConstant(ElemSize, DL, MVT::i32));
22312
22313 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22314 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22315}
22316
22317static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
22318 TargetLowering::DAGCombinerInfo &DCI,
22319 SelectionDAG &DAG) {
22320 if (DCI.isBeforeLegalize())
22321 return SDValue();
22322
22323 SDValue Comparator = N->getOperand(3);
22324 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22325 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22326 unsigned IID = getIntrinsicID(N);
22327 EVT VT = N->getValueType(0);
22328 EVT CmpVT = N->getOperand(2).getValueType();
22329 SDValue Pred = N->getOperand(1);
22330 SDValue Imm;
22331 SDLoc DL(N);
22332
22333 switch (IID) {
22334 default:
22335 llvm_unreachable("Called with wrong intrinsic!");
22336 break;
22337
22338 // Signed comparisons
22339 case Intrinsic::aarch64_sve_cmpeq_wide:
22340 case Intrinsic::aarch64_sve_cmpne_wide:
22341 case Intrinsic::aarch64_sve_cmpge_wide:
22342 case Intrinsic::aarch64_sve_cmpgt_wide:
22343 case Intrinsic::aarch64_sve_cmplt_wide:
22344 case Intrinsic::aarch64_sve_cmple_wide: {
22345 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22346 int64_t ImmVal = CN->getSExtValue();
22347 if (ImmVal >= -16 && ImmVal <= 15)
22348 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22349 else
22350 return SDValue();
22351 }
22352 break;
22353 }
22354 // Unsigned comparisons
22355 case Intrinsic::aarch64_sve_cmphs_wide:
22356 case Intrinsic::aarch64_sve_cmphi_wide:
22357 case Intrinsic::aarch64_sve_cmplo_wide:
22358 case Intrinsic::aarch64_sve_cmpls_wide: {
22359 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22360 uint64_t ImmVal = CN->getZExtValue();
22361 if (ImmVal <= 127)
22362 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22363 else
22364 return SDValue();
22365 }
22366 break;
22367 }
22368 }
22369
22370 if (!Imm)
22371 return SDValue();
22372
22373 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22374 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22375 N->getOperand(2), Splat, DAG.getCondCode(CC));
22376 }
22377
22378 return SDValue();
22379}
22380
22381static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
22382 AArch64CC::CondCode Cond) {
22383 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22384
22385 SDLoc DL(Op);
22386 assert(Op.getValueType().isScalableVector() &&
22387 TLI.isTypeLegal(Op.getValueType()) &&
22388 "Expected legal scalable vector type!");
22389 assert(Op.getValueType() == Pg.getValueType() &&
22390 "Expected same type for PTEST operands");
22391
22392 // Ensure target specific opcodes are using legal type.
22393 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22394 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22395 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22396
22397 // Ensure operands have type nxv16i1.
22398 if (Op.getValueType() != MVT::nxv16i1) {
22401 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22402 else
22403 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22404 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22405 }
22406
22407 unsigned PTest = AArch64ISD::PTEST;
22408 if (Cond == AArch64CC::ANY_ACTIVE)
22409 PTest = AArch64ISD::PTEST_ANY;
22410 else if (Cond == AArch64CC::FIRST_ACTIVE)
22411 PTest = AArch64ISD::PTEST_FIRST;
22412
22413 // Set condition code (CC) flags.
22414 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
22415
22416 // Convert CC to integer based on requested condition.
22417 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22418 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22419 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22420 return DAG.getZExtOrTrunc(Res, DL, VT);
22421}
22422
22423static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
22424 SelectionDAG &DAG) {
22425 SDLoc DL(N);
22426
22427 SDValue Pred = N->getOperand(1);
22428 SDValue VecToReduce = N->getOperand(2);
22429
22430 // NOTE: The integer reduction's result type is not always linked to the
22431 // operand's element type so we construct it from the intrinsic's result type.
22432 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22433 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22434
22435 // SVE reductions set the whole vector register with the first element
22436 // containing the reduction result, which we'll now extract.
22437 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22438 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22439 Zero);
22440}
22441
22442static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
22443 SelectionDAG &DAG) {
22444 SDLoc DL(N);
22445
22446 SDValue Pred = N->getOperand(1);
22447 SDValue VecToReduce = N->getOperand(2);
22448
22449 EVT ReduceVT = VecToReduce.getValueType();
22450 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22451
22452 // SVE reductions set the whole vector register with the first element
22453 // containing the reduction result, which we'll now extract.
22454 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22455 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22456 Zero);
22457}
22458
22459static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
22460 SelectionDAG &DAG) {
22461 SDLoc DL(N);
22462
22463 SDValue Pred = N->getOperand(1);
22464 SDValue InitVal = N->getOperand(2);
22465 SDValue VecToReduce = N->getOperand(3);
22466 EVT ReduceVT = VecToReduce.getValueType();
22467
22468 // Ordered reductions use the first lane of the result vector as the
22469 // reduction's initial value.
22470 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22471 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22472 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22473
22474 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22475
22476 // SVE reductions set the whole vector register with the first element
22477 // containing the reduction result, which we'll now extract.
22478 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22479 Zero);
22480}
22481
22482static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
22483 SelectionDAG &DAG) {
22484 if (N->getValueType(0) != MVT::i16)
22485 return SDValue();
22486
22487 SDLoc DL(N);
22488 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
22489 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
22490 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
22491}
22492
22493// If a merged operation has no inactive lanes we can relax it to a predicated
22494// or unpredicated operation, which potentially allows better isel (perhaps
22495// using immediate forms) or relaxing register reuse requirements.
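// For example, sve.sqadd(pg, a, b) with an all-active predicate can be emitted
// as the unpredicated ISD::SADDSAT(a, b).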
22496static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
22497 SelectionDAG &DAG, bool UnpredOp = false,
22498 bool SwapOperands = false) {
22499 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22500 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22501 SDValue Pg = N->getOperand(1);
22502 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22503 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22504
22505 // ISD way to specify an all active predicate.
22506 if (isAllActivePredicate(DAG, Pg)) {
22507 if (UnpredOp)
22508 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22509
22510 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22511 }
22512
22513 // FUTURE: SplatVector(true)
22514 return SDValue();
22515}
22516
22517static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22518 SDLoc DL(N);
22519 EVT VT = N->getValueType(0);
22520 SDValue Op1 = N->getOperand(1);
22521 SDValue Op2 = N->getOperand(2);
22522 SDValue Op3 = N->getOperand(3);
22523
22524 switch (IID) {
22525 default:
22526 llvm_unreachable("Called with wrong intrinsic!");
22527 case Intrinsic::aarch64_sve_bsl:
22528 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22529 case Intrinsic::aarch64_sve_bsl1n:
22530 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22531 Op2);
22532 case Intrinsic::aarch64_sve_bsl2n:
22533 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22534 DAG.getNOT(DL, Op2, VT));
22535 case Intrinsic::aarch64_sve_nbsl:
22536 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22537 VT);
22538 }
22539}
22540
22541static SDValue performIntrinsicCombine(SDNode *N,
22542 TargetLowering::DAGCombinerInfo &DCI,
22543 const AArch64Subtarget *Subtarget) {
22544 SelectionDAG &DAG = DCI.DAG;
22545 unsigned IID = getIntrinsicID(N);
22546 switch (IID) {
22547 default:
22548 break;
22549 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22550 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22551 return tryCombineFixedPointConvert(N, DCI, DAG);
22552 case Intrinsic::aarch64_neon_saddv:
22553 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22554 case Intrinsic::aarch64_neon_uaddv:
22555 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22556 case Intrinsic::aarch64_neon_sminv:
22557 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22558 case Intrinsic::aarch64_neon_uminv:
22559 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22560 case Intrinsic::aarch64_neon_smaxv:
22561 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22562 case Intrinsic::aarch64_neon_umaxv:
22563 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22564 case Intrinsic::aarch64_neon_fmax:
22565 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22566 N->getOperand(1), N->getOperand(2));
22567 case Intrinsic::aarch64_neon_fmin:
22568 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22569 N->getOperand(1), N->getOperand(2));
22570 case Intrinsic::aarch64_neon_fmaxnm:
22571 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22572 N->getOperand(1), N->getOperand(2));
22573 case Intrinsic::aarch64_neon_fminnm:
22574 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22575 N->getOperand(1), N->getOperand(2));
22576 case Intrinsic::aarch64_neon_smull:
22577 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22578 N->getOperand(1), N->getOperand(2));
22579 case Intrinsic::aarch64_neon_umull:
22580 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22581 N->getOperand(1), N->getOperand(2));
22582 case Intrinsic::aarch64_neon_pmull:
22583 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22584 N->getOperand(1), N->getOperand(2));
22585 case Intrinsic::aarch64_neon_sqdmull:
22586 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22587 case Intrinsic::aarch64_neon_sqshl:
22588 case Intrinsic::aarch64_neon_uqshl:
22589 case Intrinsic::aarch64_neon_sqshlu:
22590 case Intrinsic::aarch64_neon_srshl:
22591 case Intrinsic::aarch64_neon_urshl:
22592 case Intrinsic::aarch64_neon_sshl:
22593 case Intrinsic::aarch64_neon_ushl:
22594 return tryCombineShiftImm(IID, N, DAG);
22595 case Intrinsic::aarch64_neon_sabd:
22596 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22597 N->getOperand(1), N->getOperand(2));
22598 case Intrinsic::aarch64_neon_uabd:
22599 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22600 N->getOperand(1), N->getOperand(2));
22601 case Intrinsic::aarch64_neon_fcvtzs:
22602 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
22603 case Intrinsic::aarch64_neon_fcvtzu:
22604 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
22605 case Intrinsic::aarch64_neon_fcvtas:
22606 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
22607 case Intrinsic::aarch64_neon_fcvtau:
22608 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
22609 case Intrinsic::aarch64_neon_fcvtms:
22610 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
22611 case Intrinsic::aarch64_neon_fcvtmu:
22612 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
22613 case Intrinsic::aarch64_neon_fcvtns:
22614 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
22615 case Intrinsic::aarch64_neon_fcvtnu:
22616 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
22617 case Intrinsic::aarch64_neon_fcvtps:
22618 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
22619 case Intrinsic::aarch64_neon_fcvtpu:
22620 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
22621 case Intrinsic::aarch64_crc32b:
22622 case Intrinsic::aarch64_crc32cb:
22623 return tryCombineCRC32(0xff, N, DAG);
22624 case Intrinsic::aarch64_crc32h:
22625 case Intrinsic::aarch64_crc32ch:
22626 return tryCombineCRC32(0xffff, N, DAG);
22627 case Intrinsic::aarch64_sve_saddv:
22628 // There is no i64 version of SADDV because the sign is irrelevant.
22629 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
22630 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22631 else
22632 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22633 case Intrinsic::aarch64_sve_uaddv:
22634 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22635 case Intrinsic::aarch64_sve_smaxv:
22636 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22637 case Intrinsic::aarch64_sve_umaxv:
22638 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22639 case Intrinsic::aarch64_sve_sminv:
22640 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22641 case Intrinsic::aarch64_sve_uminv:
22642 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22643 case Intrinsic::aarch64_sve_orv:
22644 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22645 case Intrinsic::aarch64_sve_eorv:
22646 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22647 case Intrinsic::aarch64_sve_andv:
22648 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22649 case Intrinsic::aarch64_sve_index:
22650 return LowerSVEIntrinsicIndex(N, DAG);
22651 case Intrinsic::aarch64_sve_dup:
22652 return LowerSVEIntrinsicDUP(N, DAG);
22653 case Intrinsic::aarch64_sve_dup_x:
22654 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22655 N->getOperand(1));
22656 case Intrinsic::aarch64_sve_ext:
22657 return LowerSVEIntrinsicEXT(N, DAG);
22658 case Intrinsic::aarch64_sve_mul_u:
22659 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22660 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22661 case Intrinsic::aarch64_sve_smulh_u:
22662 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22663 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22664 case Intrinsic::aarch64_sve_umulh_u:
22665 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22666 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22667 case Intrinsic::aarch64_sve_smin_u:
22668 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22669 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22670 case Intrinsic::aarch64_sve_umin_u:
22671 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22672 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22673 case Intrinsic::aarch64_sve_smax_u:
22674 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22675 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22676 case Intrinsic::aarch64_sve_umax_u:
22677 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22678 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22679 case Intrinsic::aarch64_sve_lsl_u:
22680 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22681 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22682 case Intrinsic::aarch64_sve_lsr_u:
22683 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22684 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22685 case Intrinsic::aarch64_sve_asr_u:
22686 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22687 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22688 case Intrinsic::aarch64_sve_fadd_u:
22689 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22690 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22691 case Intrinsic::aarch64_sve_fdiv_u:
22692 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22693 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22694 case Intrinsic::aarch64_sve_fmax_u:
22695 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22696 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22697 case Intrinsic::aarch64_sve_fmaxnm_u:
22698 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22699 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22700 case Intrinsic::aarch64_sve_fmla_u:
22701 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22702 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22703 N->getOperand(2));
22704 case Intrinsic::aarch64_sve_fmin_u:
22705 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22706 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22707 case Intrinsic::aarch64_sve_fminnm_u:
22708 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22709 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22710 case Intrinsic::aarch64_sve_fmul_u:
22711 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22712 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22713 case Intrinsic::aarch64_sve_fsub_u:
22714 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22715 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22716 case Intrinsic::aarch64_sve_add_u:
22717 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22718 N->getOperand(3));
22719 case Intrinsic::aarch64_sve_sub_u:
22720 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22721 N->getOperand(3));
22722 case Intrinsic::aarch64_sve_subr:
22723 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22724 case Intrinsic::aarch64_sve_and_u:
22725 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22726 N->getOperand(3));
22727 case Intrinsic::aarch64_sve_bic_u:
22728 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22729 N->getOperand(2), N->getOperand(3));
22730 case Intrinsic::aarch64_sve_saddwb:
22731 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22732 N->getOperand(1), N->getOperand(2));
22733 case Intrinsic::aarch64_sve_saddwt:
22734 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22735 N->getOperand(1), N->getOperand(2));
22736 case Intrinsic::aarch64_sve_uaddwb:
22737 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22738 N->getOperand(1), N->getOperand(2));
22739 case Intrinsic::aarch64_sve_uaddwt:
22740 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22741 N->getOperand(1), N->getOperand(2));
22742 case Intrinsic::aarch64_sve_eor_u:
22743 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22744 N->getOperand(3));
22745 case Intrinsic::aarch64_sve_orr_u:
22746 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22747 N->getOperand(3));
22748 case Intrinsic::aarch64_sve_sabd_u:
22749 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22750 N->getOperand(2), N->getOperand(3));
22751 case Intrinsic::aarch64_sve_uabd_u:
22752 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22753 N->getOperand(2), N->getOperand(3));
22754 case Intrinsic::aarch64_sve_sdiv_u:
22755 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22756 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22757 case Intrinsic::aarch64_sve_udiv_u:
22758 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22759 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22760 case Intrinsic::aarch64_sve_sqadd:
22761 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22762 case Intrinsic::aarch64_sve_sqsub_u:
22763 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22764 N->getOperand(2), N->getOperand(3));
22765 case Intrinsic::aarch64_sve_uqadd:
22766 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22767 case Intrinsic::aarch64_sve_uqsub_u:
22768 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22769 N->getOperand(2), N->getOperand(3));
22770 case Intrinsic::aarch64_sve_sqadd_x:
22771 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22772 N->getOperand(1), N->getOperand(2));
22773 case Intrinsic::aarch64_sve_sqsub_x:
22774 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22775 N->getOperand(1), N->getOperand(2));
22776 case Intrinsic::aarch64_sve_uqadd_x:
22777 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22778 N->getOperand(1), N->getOperand(2));
22779 case Intrinsic::aarch64_sve_uqsub_x:
22780 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22781 N->getOperand(1), N->getOperand(2));
22782 case Intrinsic::aarch64_sve_asrd:
22783 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22784 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22785 case Intrinsic::aarch64_sve_cmphs:
22786 if (!N->getOperand(2).getValueType().isFloatingPoint())
22787 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22788 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22789 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22790 break;
22791 case Intrinsic::aarch64_sve_cmphi:
22792 if (!N->getOperand(2).getValueType().isFloatingPoint())
22793 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22794 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22795 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22796 break;
22797 case Intrinsic::aarch64_sve_fcmpge:
22798 case Intrinsic::aarch64_sve_cmpge:
22799 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22800 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22801 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22802 break;
22803 case Intrinsic::aarch64_sve_fcmpgt:
22804 case Intrinsic::aarch64_sve_cmpgt:
22805 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22806 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22807 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22808 break;
22809 case Intrinsic::aarch64_sve_fcmpeq:
22810 case Intrinsic::aarch64_sve_cmpeq:
22811 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22812 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22813 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22814 break;
22815 case Intrinsic::aarch64_sve_fcmpne:
22816 case Intrinsic::aarch64_sve_cmpne:
22817 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22818 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22819 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22820 break;
22821 case Intrinsic::aarch64_sve_fcmpuo:
22822 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22823 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22824 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22825 break;
22826 case Intrinsic::aarch64_sve_fadda:
22827 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22828 case Intrinsic::aarch64_sve_faddv:
22829 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22830 case Intrinsic::aarch64_sve_fmaxnmv:
22831 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22832 case Intrinsic::aarch64_sve_fmaxv:
22833 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22834 case Intrinsic::aarch64_sve_fminnmv:
22835 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22836 case Intrinsic::aarch64_sve_fminv:
22837 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22838 case Intrinsic::aarch64_sve_sel:
22839 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22840 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22841 case Intrinsic::aarch64_sve_cmpeq_wide:
22842 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22843 case Intrinsic::aarch64_sve_cmpne_wide:
22844 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22845 case Intrinsic::aarch64_sve_cmpge_wide:
22846 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22847 case Intrinsic::aarch64_sve_cmpgt_wide:
22848 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22849 case Intrinsic::aarch64_sve_cmplt_wide:
22850 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22851 case Intrinsic::aarch64_sve_cmple_wide:
22852 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22853 case Intrinsic::aarch64_sve_cmphs_wide:
22854 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22855 case Intrinsic::aarch64_sve_cmphi_wide:
22856 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22857 case Intrinsic::aarch64_sve_cmplo_wide:
22858 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22859 case Intrinsic::aarch64_sve_cmpls_wide:
22860 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22861 case Intrinsic::aarch64_sve_ptest_any:
22862 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22863 AArch64CC::ANY_ACTIVE);
22864 case Intrinsic::aarch64_sve_ptest_first:
22865 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22866 AArch64CC::FIRST_ACTIVE);
22867 case Intrinsic::aarch64_sve_ptest_last:
22868 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22869 AArch64CC::LAST_ACTIVE);
22870 case Intrinsic::aarch64_sve_whilelo:
22871 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
22872 N->getOperand(1), N->getOperand(2));
22873 case Intrinsic::aarch64_sve_bsl:
22874 case Intrinsic::aarch64_sve_bsl1n:
22875 case Intrinsic::aarch64_sve_bsl2n:
22876 case Intrinsic::aarch64_sve_nbsl:
22877 return combineSVEBitSel(IID, N, DAG);
22878 }
22879 return SDValue();
22880}
22881
22882static bool isCheapToExtend(const SDValue &N) {
22883 unsigned OC = N->getOpcode();
22884 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22885 ISD::isConstantSplatVectorAllZeros(N.getNode());
22886}
22887
22888static SDValue
22889performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22890 SelectionDAG &DAG) {
22891 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22892 // we can move the sext into the arguments and have the same result. For
22893 // example, if A and B are both loads, we can make those extending loads and
22894 // avoid an extra instruction. This pattern appears often in VLS code
22895 // generation where the inputs to the setcc have a different size to the
22896 // instruction that wants to use the result of the setcc.
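 // As an illustration, for a fixed-length v8i16 result,
 //   sign_extend(setcc(v8i8 load a, v8i8 load b, cc))
 // can become setcc(extend(load a), extend(load b), cc) at v8i16, letting the
 // extends fold into the loads.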
22897 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22898 N->getOperand(0)->getOpcode() == ISD::SETCC);
22899 const SDValue SetCC = N->getOperand(0);
22900
22901 const SDValue CCOp0 = SetCC.getOperand(0);
22902 const SDValue CCOp1 = SetCC.getOperand(1);
22903 if (!CCOp0->getValueType(0).isInteger() ||
22904 !CCOp1->getValueType(0).isInteger())
22905 return SDValue();
22906
22907 ISD::CondCode Code =
22908 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22909
22910 ISD::NodeType ExtType =
22911 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22912
22913 if (isCheapToExtend(SetCC.getOperand(0)) &&
22914 isCheapToExtend(SetCC.getOperand(1))) {
22915 const SDValue Ext1 =
22916 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22917 const SDValue Ext2 =
22918 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22919
22920 return DAG.getSetCC(
22921 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22922 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22923 }
22924
22925 return SDValue();
22926}
22927
22928// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22929// This comes from interleaved vectorization. It is performed late to capture
22930// uitofp converts too.
22931static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22932 SelectionDAG &DAG) {
22933 EVT VT = N->getValueType(0);
22934 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22935 N->getOpcode() != ISD::ZERO_EXTEND ||
22936 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22937 return SDValue();
22938
22939 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22940 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22941 return SDValue();
22942
22943 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22944 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22945 if (!Shuffle ||
22946 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22947 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22948 return SDValue();
22949
22950 unsigned Idx;
22951 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22952 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22953 // An undef interleave shuffle can come up after other canonicalizations,
22954 // where the shuffle has been converted to
22955 // zext(extract(shuffle b, undef, [u,u,0,4]))
22956 bool IsUndefDeInterleave = false;
22957 if (!IsDeInterleave)
22958 IsUndefDeInterleave =
22959 Shuffle->getOperand(1).isUndef() &&
22960 all_of(
22961 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
22962 [](int M) { return M < 0; }) &&
22963 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22964 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22965 VT.getVectorNumElements() / 2),
22966 4, Idx);
22967 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22968 return SDValue();
22969 SDLoc DL(N);
22970 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22971 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22972 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22973 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22974 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22975 VT, BC1, BC2);
22976 if ((Idx & 1) == 1)
22977 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22978 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22979 return DAG.getNode(
22980 ISD::AND, DL, VT, UZP,
22981 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22982}
22983
22984// This comes up similarly to the above when lowering deinterleaving shuffles
22985// from zexts. We have legalized the operations in the general case to
22986// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22987// the extract is to the low half and the uzp is uzp1. There would be an extra
22988// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22989// there could also be an existing and / shift that can be combined in, either
22990// before or after the extract.
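// For example (illustrative only),
//   zext(v8i8 extract_subvector(uzp1(v16i8 a, v16i8 b), 0)) to v8i16
// becomes and(nvcast<v8i16>(a), splat(0x00ff)); the uzp2 form additionally
// needs a right shift by 8 to pick out the high bytes.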
22991static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22992 EVT VT = N->getValueType(0);
22993 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22994 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22995 return SDValue();
22996
22997 SDValue Op = N->getOperand(0);
22998 unsigned ExtOffset = (unsigned)-1;
22999 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23000 ExtOffset = Op.getConstantOperandVal(1);
23001 Op = Op.getOperand(0);
23002 }
23003
23004 unsigned Shift = 0;
23005 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
23006 Op.getValueType().getScalarSizeInBits());
23007
23008 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23009 Shift = Op.getConstantOperandVal(1);
23010 Op = Op.getOperand(0);
23011 Mask = Mask.lshr(Shift);
23012 }
23013 if (Op.getOpcode() == ISD::AND &&
23014 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23015 Op = Op.getOperand(0);
23016 Mask = Mask.zext(VT.getScalarSizeInBits());
23017 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23018 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23019 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23020 Mask = Mask.zext(VT.getScalarSizeInBits());
23021 Op = Op.getOperand(0);
23022 }
23023
23024 if (ExtOffset == (unsigned)-1) {
23025 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23026 ExtOffset = Op.getConstantOperandVal(1);
23027 Op = Op.getOperand(0);
23028 } else
23029 return SDValue();
23030 }
23031 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23032 return SDValue();
23033
23034 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23035 return SDValue();
23036 if (Op.getOpcode() == AArch64ISD::UZP2)
23037 Shift += VT.getScalarSizeInBits() / 2;
23038
23039 SDLoc DL(N);
23040 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23041 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23042 if (Shift != 0)
23043 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23044 DAG.getConstant(Shift, DL, MVT::i32));
23045 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23046}
23047
23048static SDValue performExtendCombine(SDNode *N,
23049 TargetLowering::DAGCombinerInfo &DCI,
23050 SelectionDAG &DAG) {
23051 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23052 // we can convert that DUP into another extract_high (of a bigger DUP), which
23053 // helps the backend to decide that an sabdl2 would be useful, saving a real
23054 // extract_high operation.
23055 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23056 N->getOperand(0).getValueType().is64BitVector() &&
23057 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23058 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23059 SDNode *ABDNode = N->getOperand(0).getNode();
23060 SDValue NewABD =
23061 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
23062 if (!NewABD.getNode())
23063 return SDValue();
23064
23065 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23066 }
23067
23068 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
23069 return R;
23070 if (SDValue R = performZExtUZPCombine(N, DAG))
23071 return R;
23072
23073 if (N->getValueType(0).isFixedLengthVector() &&
23074 N->getOpcode() == ISD::SIGN_EXTEND &&
23075 N->getOperand(0)->getOpcode() == ISD::SETCC)
23076 return performSignExtendSetCCCombine(N, DCI, DAG);
23077
23078 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23079 // that the top half of the result register must be unused, due to the
23080 // any_extend. This means that we can replace this pattern with (rev16
23081 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23082 // ...)), which is what this pattern would otherwise be lowered to.
23083 // Only apply this optimisation if the any_extend in the original pattern
23084 // extends to i32 or i64, because this type will become the input type to
23085 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
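 // For example: (i32 any_extend (i16 bswap x)) --> (i32 REV16 (i32 any_extend x)).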
23086 SDValue Bswap = N->getOperand(0);
23087 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23088 Bswap.getValueType() == MVT::i16 &&
23089 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23090 SDLoc DL(N);
23091 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23092 Bswap->getOperand(0));
23093 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23094 NewAnyExtend);
23095 }
23096
23097 return SDValue();
23098}
23099
23100static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
23101 SDValue SplatVal, unsigned NumVecElts) {
23102 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23103 Align OrigAlignment = St.getAlign();
23104 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23105
23106 // Create scalar stores. This is at least as good as the code sequence for a
23107 // split unaligned store which is a dup.s, ext.b, and two stores.
23108 // Most of the time the three stores should be replaced by store pair
23109 // instructions (stp).
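 // For example, splatting a 32-bit value into a v4i32 store becomes four
 // scalar stores at offsets 0, 4, 8 and 12, which the load/store optimizer
 // can typically merge into two stp instructions.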
23110 SDLoc DL(&St);
23111 SDValue BasePtr = St.getBasePtr();
23112 uint64_t BaseOffset = 0;
23113
23114 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23115 SDValue NewST1 =
23116 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23117 OrigAlignment, St.getMemOperand()->getFlags());
23118
23119 // As this is in ISel, we will not merge this add, which may degrade results.
23120 if (BasePtr->getOpcode() == ISD::ADD &&
23121 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23122 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23123 BasePtr = BasePtr->getOperand(0);
23124 }
23125
23126 unsigned Offset = EltOffset;
23127 while (--NumVecElts) {
23128 Align Alignment = commonAlignment(OrigAlignment, Offset);
23129 SDValue OffsetPtr =
23130 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23131 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23132 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23133 PtrInfo.getWithOffset(Offset), Alignment,
23134 St.getMemOperand()->getFlags());
23135 Offset += EltOffset;
23136 }
23137 return NewST1;
23138}
23139
23140// Returns an SVE type that ContentTy can be trivially sign or zero extended
23141// into.
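// For example, nxv4i8 and nxv4i16 both map to an nxv4i32 container, while
// nxv2f32 is held in an nxv2i64 container.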
23142static MVT getSVEContainerType(EVT ContentTy) {
23143 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23144
23145 switch (ContentTy.getSimpleVT().SimpleTy) {
23146 default:
23147 llvm_unreachable("No known SVE container for this MVT type");
23148 case MVT::nxv2i8:
23149 case MVT::nxv2i16:
23150 case MVT::nxv2i32:
23151 case MVT::nxv2i64:
23152 case MVT::nxv2f32:
23153 case MVT::nxv2f64:
23154 return MVT::nxv2i64;
23155 case MVT::nxv4i8:
23156 case MVT::nxv4i16:
23157 case MVT::nxv4i32:
23158 case MVT::nxv4f32:
23159 return MVT::nxv4i32;
23160 case MVT::nxv8i8:
23161 case MVT::nxv8i16:
23162 case MVT::nxv8f16:
23163 case MVT::nxv8bf16:
23164 return MVT::nxv8i16;
23165 case MVT::nxv16i8:
23166 return MVT::nxv16i8;
23167 }
23168}
23169
23170static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23171 SDLoc DL(N);
23172 EVT VT = N->getValueType(0);
23173
23174 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23175 return SDValue();
23176
23177 EVT ContainerVT = VT;
23178 if (ContainerVT.isInteger())
23179 ContainerVT = getSVEContainerType(ContainerVT);
23180
23181 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23182 SDValue Ops[] = { N->getOperand(0), // Chain
23183 N->getOperand(2), // Pg
23184 N->getOperand(3), // Base
23185 DAG.getValueType(VT) };
23186
23187 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23188 SDValue LoadChain = SDValue(Load.getNode(), 1);
23189
23190 if (ContainerVT.isInteger() && (VT != ContainerVT))
23191 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23192
23193 return DAG.getMergeValues({ Load, LoadChain }, DL);
23194}
23195
23196static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23197 SDLoc DL(N);
23198 EVT VT = N->getValueType(0);
23199 EVT PtrTy = N->getOperand(3).getValueType();
23200
23201 EVT LoadVT = VT;
23202 if (VT.isFloatingPoint())
23203 LoadVT = VT.changeTypeToInteger();
23204
23205 auto *MINode = cast<MemIntrinsicSDNode>(N);
23206 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23207 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23208 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23209 MINode->getOperand(2), PassThru,
23210 MINode->getMemoryVT(), MINode->getMemOperand(),
23211 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23212
23213 if (VT.isFloatingPoint()) {
23214 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23215 return DAG.getMergeValues(Ops, DL);
23216 }
23217
23218 return L;
23219}
23220
23221template <unsigned Opcode>
23222static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
23223 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23224 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23225 "Unsupported opcode.");
23226 SDLoc DL(N);
23227 EVT VT = N->getValueType(0);
23228
23229 EVT LoadVT = VT;
23230 if (VT.isFloatingPoint())
23231 LoadVT = VT.changeTypeToInteger();
23232
23233 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23234 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23235 SDValue LoadChain = SDValue(Load.getNode(), 1);
23236
23237 if (VT.isFloatingPoint())
23238 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23239
23240 return DAG.getMergeValues({Load, LoadChain}, DL);
23241}
23242
23243static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23244  SDLoc DL(N);
23245 SDValue Data = N->getOperand(2);
23246 EVT DataVT = Data.getValueType();
23247 EVT HwSrcVt = getSVEContainerType(DataVT);
23248 SDValue InputVT = DAG.getValueType(DataVT);
23249
23250 if (DataVT.isFloatingPoint())
23251 InputVT = DAG.getValueType(HwSrcVt);
23252
23253 SDValue SrcNew;
23254 if (Data.getValueType().isFloatingPoint())
23255 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23256 else
23257 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23258
23259 SDValue Ops[] = { N->getOperand(0), // Chain
23260 SrcNew,
23261 N->getOperand(4), // Base
23262 N->getOperand(3), // Pg
23263 InputVT
23264 };
23265
23266 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23267}
23268
23269static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23270  SDLoc DL(N);
23271
23272 SDValue Data = N->getOperand(2);
23273 EVT DataVT = Data.getValueType();
23274 EVT PtrTy = N->getOperand(4).getValueType();
23275
23276 if (DataVT.isFloatingPoint())
23277 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23278
23279 auto *MINode = cast<MemIntrinsicSDNode>(N);
23280 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23281 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23282 MINode->getMemoryVT(), MINode->getMemOperand(),
23283 ISD::UNINDEXED, false, false);
23284}
23285
23286/// Replace a vector store of a splat of zeros with scalar stores of WZR/XZR. The
23287/// load/store optimizer pass will merge them into store pair stores. This should
23288/// be better than a movi to create the vector zero followed by a vector store
23289/// if the zero constant is not re-used, since one instruction and one register
23290/// live range will be removed.
23291///
23292/// For example, the final generated code should be:
23293///
23294/// stp xzr, xzr, [x0]
23295///
23296/// instead of:
23297///
23298/// movi v0.2d, #0
23299/// str q0, [x0]
23300///
23301static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23302  SDValue StVal = St.getValue();
23303 EVT VT = StVal.getValueType();
23304
23305 // Avoid scalarizing zero splat stores for scalable vectors.
23306 if (VT.isScalableVector())
23307 return SDValue();
23308
23309 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23310 // 2, 3 or 4 i32 elements.
23311 int NumVecElts = VT.getVectorNumElements();
23312 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23313 VT.getVectorElementType().getSizeInBits() == 64) ||
23314 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23315 VT.getVectorElementType().getSizeInBits() == 32)))
23316 return SDValue();
23317
23318 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23319 return SDValue();
23320
23321 // If the zero constant has more than one use then the vector store could be
23322 // better since the constant mov will be amortized and stp q instructions
23323 // should be able to be formed.
23324 if (!StVal.hasOneUse())
23325 return SDValue();
23326
23327 // If the store is truncating then it's going down to i16 or smaller, which
23328 // means it can be implemented in a single store anyway.
23329 if (St.isTruncatingStore())
23330 return SDValue();
23331
23332 // If the immediate offset of the address operand is too large for the stp
23333 // instruction, then bail out.
23334 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23335 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23336    if (Offset < -512 || Offset > 504)
23337      return SDValue();
23338 }
23339
23340 for (int I = 0; I < NumVecElts; ++I) {
23341 SDValue EltVal = StVal.getOperand(I);
23342 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23343 return SDValue();
23344 }
23345
23346 // Use a CopyFromReg WZR/XZR here to prevent
23347 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23348 SDLoc DL(&St);
23349 unsigned ZeroReg;
23350 EVT ZeroVT;
23351 if (VT.getVectorElementType().getSizeInBits() == 32) {
23352 ZeroReg = AArch64::WZR;
23353 ZeroVT = MVT::i32;
23354 } else {
23355 ZeroReg = AArch64::XZR;
23356 ZeroVT = MVT::i64;
23357 }
23358 SDValue SplatVal =
23359 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23360 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23361}
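// A concrete sketch of the combine above: for IR along the lines of
//   store <2 x i64> zeroinitializer, ptr %p
// the vector store becomes two i64 stores of XZR at offsets 0 and 8, which
// the load/store optimizer then merges into a single "stp xzr, xzr, [x0]".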
23362
23363/// Replace a vector store of a splatted scalar with scalar stores of the scalar
23364/// value. The load/store optimizer pass will merge them into store pair stores.
23365/// This has better performance than a splat of the scalar followed by a split
23366/// vector store. Even if the stores are not merged, it is four stores vs. a dup
23367/// followed by an ext.b and two stores.
23368static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23369  SDValue StVal = St.getValue();
23370 EVT VT = StVal.getValueType();
23371
23372 // Don't replace floating point stores, they possibly won't be transformed to
23373 // stp because of the store pair suppress pass.
23374 if (VT.isFloatingPoint())
23375 return SDValue();
23376
23377 // We can express a splat as store pair(s) for 2 or 4 elements.
23378 unsigned NumVecElts = VT.getVectorNumElements();
23379 if (NumVecElts != 4 && NumVecElts != 2)
23380 return SDValue();
23381
23382 // If the store is truncating then it's going down to i16 or smaller, which
23383 // means it can be implemented in a single store anyway.
23384 if (St.isTruncatingStore())
23385 return SDValue();
23386
23387 // Check that this is a splat.
23388  // Make sure that each of the relevant vector element locations is inserted
23389  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2 and 3 for v4i32.
23390 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23391 SDValue SplatVal;
23392 for (unsigned I = 0; I < NumVecElts; ++I) {
23393 // Check for insert vector elements.
23394 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23395 return SDValue();
23396
23397 // Check that same value is inserted at each vector element.
23398 if (I == 0)
23399 SplatVal = StVal.getOperand(1);
23400 else if (StVal.getOperand(1) != SplatVal)
23401 return SDValue();
23402
23403 // Check insert element index.
23404    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
23405    if (!CIndex)
23406 return SDValue();
23407 uint64_t IndexVal = CIndex->getZExtValue();
23408 if (IndexVal >= NumVecElts)
23409 return SDValue();
23410 IndexNotInserted.reset(IndexVal);
23411
23412 StVal = StVal.getOperand(0);
23413 }
23414 // Check that all vector element locations were inserted to.
23415 if (IndexNotInserted.any())
23416 return SDValue();
23417
23418 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23419}
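// Rough example of the combine above: a store of a v4i32 built by inserting
// the same scalar into all four lanes becomes four 32-bit scalar stores of
// that scalar at byte offsets 0, 4, 8 and 12, which the load/store optimizer
// can then merge into two stp instructions.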
23420
23421static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23422                           SelectionDAG &DAG,
23423 const AArch64Subtarget *Subtarget) {
23424
23425  StoreSDNode *S = cast<StoreSDNode>(N);
23426  if (S->isVolatile() || S->isIndexed())
23427 return SDValue();
23428
23429 SDValue StVal = S->getValue();
23430 EVT VT = StVal.getValueType();
23431
23432 if (!VT.isFixedLengthVector())
23433 return SDValue();
23434
23435 // If we get a splat of zeros, convert this vector store to a store of
23436 // scalars. They will be merged into store pairs of xzr thereby removing one
23437 // instruction and one register.
23438 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23439 return ReplacedZeroSplat;
23440
23441 // FIXME: The logic for deciding if an unaligned store should be split should
23442 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23443 // a call to that function here.
23444
23445 if (!Subtarget->isMisaligned128StoreSlow())
23446 return SDValue();
23447
23448 // Don't split at -Oz.
23449  if (DAG.getMachineFunction().getFunction().hasMinSize())
23450    return SDValue();
23451
23452 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23453 // those up regresses performance on micro-benchmarks and olden/bh.
23454 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23455 return SDValue();
23456
23457 // Split unaligned 16B stores. They are terrible for performance.
23458 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23459 // extensions can use this to mark that it does not want splitting to happen
23460 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23461 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
23462 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23463 S->getAlign() <= Align(2))
23464 return SDValue();
23465
23466 // If we get a splat of a scalar convert this vector store to a store of
23467 // scalars. They will be merged into store pairs thereby removing two
23468 // instructions.
23469 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23470 return ReplacedSplat;
23471
23472 SDLoc DL(S);
23473
23474 // Split VT into two.
23475 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23476 unsigned NumElts = HalfVT.getVectorNumElements();
23477 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23478 DAG.getConstant(0, DL, MVT::i64));
23479 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23480 DAG.getConstant(NumElts, DL, MVT::i64));
23481 SDValue BasePtr = S->getBasePtr();
23482 SDValue NewST1 =
23483 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23484 S->getAlign(), S->getMemOperand()->getFlags());
23485 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23486 DAG.getConstant(8, DL, MVT::i64));
23487 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23488 S->getPointerInfo(), S->getAlign(),
23489 S->getMemOperand()->getFlags());
23490}
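// For illustration: on targets where misaligned 128-bit stores are slow, an
// underaligned store such as
//   store <4 x i32> %v, ptr %p, align 4
// is split here into two 64-bit stores of the low and high halves at byte
// offsets 0 and 8 (assuming no zero- or scalar-splat rewrite applied first).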
23491
23492static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
23493  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23494
23495 // splice(pg, op1, undef) -> op1
23496 if (N->getOperand(2).isUndef())
23497 return N->getOperand(1);
23498
23499 return SDValue();
23500}
23501
23502static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
23503                                    const AArch64Subtarget *Subtarget) {
23504 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23505 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23506 "Unexpected Opcode!");
23507
23508 // uunpklo/hi undef -> undef
23509 if (N->getOperand(0).isUndef())
23510 return DAG.getUNDEF(N->getValueType(0));
23511
23512 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23513 // extending load. We can do this even if this is already a masked
23514 // {z,}extload.
23515 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23516 N->getOpcode() == AArch64ISD::UUNPKLO) {
23517 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23518 SDValue Mask = MLD->getMask();
23519 SDLoc DL(N);
23520
23521 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23522 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23523 (MLD->getPassThru()->isUndef() ||
23524 isZerosVector(MLD->getPassThru().getNode()))) {
23525 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23526 unsigned PgPattern = Mask->getConstantOperandVal(0);
23527 EVT VT = N->getValueType(0);
23528
23529 // Ensure we can double the size of the predicate pattern
23530 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23531 if (NumElts &&
23532 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23533 Mask =
23534 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23535 SDValue PassThru = DAG.getConstant(0, DL, VT);
23536 SDValue NewLoad = DAG.getMaskedLoad(
23537 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23538 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23539            MLD->getAddressingMode(), ISD::ZEXTLOAD);
23540
23541 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23542
23543 return NewLoad;
23544 }
23545 }
23546 }
23547
23548 return SDValue();
23549}
23550
23551static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
23552  if (N->getOpcode() != AArch64ISD::UZP1)
23553 return false;
23554 SDValue Op0 = N->getOperand(0);
23555 EVT SrcVT = Op0->getValueType(0);
23556 EVT DstVT = N->getValueType(0);
23557 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23558 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23559 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23560}
23561
23562// Try to combine rounding shifts where the operands come from an extend, and
23563// the result is truncated and combined into one vector.
23564// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23565static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23566  assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23567 SDValue Op0 = N->getOperand(0);
23568 SDValue Op1 = N->getOperand(1);
23569 EVT ResVT = N->getValueType(0);
23570
23571 unsigned RshOpc = Op0.getOpcode();
23572 if (RshOpc != AArch64ISD::RSHRNB_I)
23573 return SDValue();
23574
23575 // Same op code and imm value?
23576 SDValue ShiftValue = Op0.getOperand(1);
23577 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23578 return SDValue();
23579
23580 // Same unextended operand value?
23581 SDValue Lo = Op0.getOperand(0);
23582 SDValue Hi = Op1.getOperand(0);
23583 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23584 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23585 return SDValue();
23586 SDValue OrigArg = Lo.getOperand(0);
23587 if (OrigArg != Hi.getOperand(0))
23588 return SDValue();
23589
23590 SDLoc DL(N);
23591 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23592 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23593 ShiftValue);
23594}
23595
23596// Try to simplify:
23597// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23598// t2 = nxv8i16 srl(t1, ShiftValue)
23599// to
23600// t1 = nxv8i16 rshrnb(X, shiftvalue).
23601// rshrnb will zero the top half of each element. Therefore, this combine should
23602// only be performed when the instruction that uses the rshrnb result does not
23603// care about the top half of each element, for example a uzp1 or a truncating
23604// store.
23605static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23606                                         const AArch64Subtarget *Subtarget) {
23607 EVT VT = Srl->getValueType(0);
23608 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23609 return SDValue();
23610
23611 EVT ResVT;
23612 if (VT == MVT::nxv8i16)
23613 ResVT = MVT::nxv16i8;
23614 else if (VT == MVT::nxv4i32)
23615 ResVT = MVT::nxv8i16;
23616 else if (VT == MVT::nxv2i64)
23617 ResVT = MVT::nxv4i32;
23618 else
23619 return SDValue();
23620
23621 SDLoc DL(Srl);
23622 unsigned ShiftValue;
23623 SDValue RShOperand;
23624 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23625 return SDValue();
23626 SDValue Rshrnb = DAG.getNode(
23627 AArch64ISD::RSHRNB_I, DL, ResVT,
23628 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23629 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23630}
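// A worked example of the pattern above, assuming ShiftValue == 4:
//   t1 = nxv8i16 add(X, 8)    // 8 == 1 << (4 - 1)
//   t2 = nxv8i16 srl(t1, 4)
// computes a rounding shift right by 4, which RSHRNB performs directly while
// also narrowing, so the pair becomes nxv16i8 rshrnb(X, 4) viewed through an
// NVCAST back to nxv8i16.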
23631
23632static SDValue isNVCastToHalfWidthElements(SDValue V) {
23633  if (V.getOpcode() != AArch64ISD::NVCAST)
23634 return SDValue();
23635
23636 SDValue Op = V.getOperand(0);
23637 if (!Op.getValueType().isVector() ||
23638 V.getValueType().getVectorElementCount() !=
23639 Op.getValueType().getVectorElementCount() * 2)
23640 return SDValue();
23641
23642 return Op;
23643}
23644
23645static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23646                                 const AArch64Subtarget *Subtarget) {
23647 SDLoc DL(N);
23648 SDValue Op0 = N->getOperand(0);
23649 SDValue Op1 = N->getOperand(1);
23650 EVT ResVT = N->getValueType(0);
23651
23652 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23653 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23654      Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23655      Op0.getOperand(0) == Op1.getOperand(0)) {
23656
23657 SDValue SourceVec = Op0.getOperand(0);
23658 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23659 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23660 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23661 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23662 EVT OpVT = Op0.getOperand(1).getValueType();
23663 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23664 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23665 DAG.getUNDEF(WidenedResVT));
23666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23667 DAG.getConstant(0, DL, OpVT));
23668 }
23669 }
23670
23671  // The following optimizations only work with uzp1.
23672 if (N->getOpcode() == AArch64ISD::UZP2)
23673 return SDValue();
23674
23675 // uzp1(x, undef) -> concat(truncate(x), undef)
23676 if (Op1.getOpcode() == ISD::UNDEF) {
23677 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23678 switch (ResVT.getSimpleVT().SimpleTy) {
23679 default:
23680 break;
23681 case MVT::v16i8:
23682 BCVT = MVT::v8i16;
23683 HalfVT = MVT::v8i8;
23684 break;
23685 case MVT::v8i16:
23686 BCVT = MVT::v4i32;
23687 HalfVT = MVT::v4i16;
23688 break;
23689 case MVT::v4i32:
23690 BCVT = MVT::v2i64;
23691 HalfVT = MVT::v2i32;
23692 break;
23693 }
23694 if (BCVT != MVT::Other) {
23695 SDValue BC = DAG.getBitcast(BCVT, Op0);
23696 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23697 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23698 DAG.getUNDEF(HalfVT));
23699 }
23700 }
23701
23702 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23703 return Urshr;
23704
23705 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23706 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23707 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23708 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23709 }
23710 }
23711
23712 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23713 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23714 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23715 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23716 }
23717 }
23718
23719 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23720 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23721 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23722 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23723 SDValue X = PreCast.getOperand(0).getOperand(0);
23724 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23725 }
23726 }
23727 }
23728
23729 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23730 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23731 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23732 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23733 SDValue Z = PreCast.getOperand(0).getOperand(1);
23734 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23735 }
23736 }
23737 }
23738
23739 // These optimizations only work on little endian.
23740 if (!DAG.getDataLayout().isLittleEndian())
23741 return SDValue();
23742
23743 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23744 // Example:
23745 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23746 // to
23747 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23748  if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23749      Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23750 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23751 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23752 Op1.getOperand(0));
23753 }
23754 }
23755
23756 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23757 return SDValue();
23758
23759 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23760 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23761
23762 // truncating uzp1(x, y) -> xtn(concat (x, y))
23763 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23764 EVT Op0Ty = SourceOp0.getValueType();
23765 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23766 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23767 SDValue Concat =
23768          DAG.getNode(ISD::CONCAT_VECTORS, DL,
23769                      Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23770                      SourceOp0, SourceOp1);
23771 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23772 }
23773 }
23774
23775 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23776 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23777 SourceOp1.getOpcode() != ISD::TRUNCATE)
23778 return SDValue();
23779 SourceOp0 = SourceOp0.getOperand(0);
23780 SourceOp1 = SourceOp1.getOperand(0);
23781
23782 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23783 !SourceOp0.getValueType().isSimple())
23784 return SDValue();
23785
23786 EVT ResultTy;
23787
23788 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23789 case MVT::v2i64:
23790 ResultTy = MVT::v4i32;
23791 break;
23792 case MVT::v4i32:
23793 ResultTy = MVT::v8i16;
23794 break;
23795 case MVT::v8i16:
23796 ResultTy = MVT::v16i8;
23797 break;
23798 default:
23799 return SDValue();
23800 }
23801
23802 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23803 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23804 SDValue UzpResult =
23805 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23806
23807 EVT BitcastResultTy;
23808
23809 switch (ResVT.getSimpleVT().SimpleTy) {
23810 case MVT::v2i32:
23811 BitcastResultTy = MVT::v2i64;
23812 break;
23813 case MVT::v4i16:
23814 BitcastResultTy = MVT::v4i32;
23815 break;
23816 case MVT::v8i8:
23817 BitcastResultTy = MVT::v8i16;
23818 break;
23819 default:
23820 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23821 }
23822
23823 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23824 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23825}
23826
23827static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23828  unsigned Opc = N->getOpcode();
23829
23830 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23831 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23832 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23833 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23834 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23835 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23836 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23837 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23838
23839 SDLoc DL(N);
23840 SDValue Chain = N->getOperand(0);
23841 SDValue Pg = N->getOperand(1);
23842 SDValue Base = N->getOperand(2);
23843 SDValue Offset = N->getOperand(3);
23844 SDValue Ty = N->getOperand(4);
23845
23846 EVT ResVT = N->getValueType(0);
23847
23848 const auto OffsetOpc = Offset.getOpcode();
23849 const bool OffsetIsZExt =
23850 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23851 const bool OffsetIsSExt =
23852 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23853
23854 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23855 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23856 SDValue ExtPg = Offset.getOperand(0);
23857 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23858 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23859
23860    // If the predicate for the sign- or zero-extended offset is the
23861    // same as the predicate used for this load and the sign-/zero-extension
23862    // was from a 32-bit value, the extension can be folded into the gather.
23863 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23864 SDValue UnextendedOffset = Offset.getOperand(1);
23865
23866 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23867 if (Signed)
23868 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23869
23870 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23871 {Chain, Pg, Base, UnextendedOffset, Ty});
23872 }
23873 }
23874
23875 return SDValue();
23876}
23877
23878/// Optimize a vector shift instruction and its operand if shifted out
23879/// bits are not used.
23880static SDValue performVectorShiftCombine(SDNode *N,
23881                                         const AArch64TargetLowering &TLI,
23882                                         TargetLowering::DAGCombinerInfo &DCI) {
23883 assert(N->getOpcode() == AArch64ISD::VASHR ||
23884 N->getOpcode() == AArch64ISD::VLSHR);
23885
23886 SDValue Op = N->getOperand(0);
23887 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23888
23889 unsigned ShiftImm = N->getConstantOperandVal(1);
23890 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23891
23892  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
23893 if (N->getOpcode() == AArch64ISD::VASHR &&
23894 Op.getOpcode() == AArch64ISD::VSHL &&
23895 N->getOperand(1) == Op.getOperand(1))
23896 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23897 return Op.getOperand(0);
23898
23899 // If the shift is exact, the shifted out bits matter.
23900 if (N->getFlags().hasExact())
23901 return SDValue();
23902
23903 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23904 APInt DemandedMask = ~ShiftedOutBits;
23905
23906 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23907 return SDValue(N, 0);
23908
23909 return SDValue();
23910}
23911
23912static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23913  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23914 // This transform works in partnership with performSetCCPunpkCombine to
23915 // remove unnecessary transfer of predicates into standard registers and back
23916 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23917 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23918 MVT::i1) {
23919 SDValue CC = N->getOperand(0)->getOperand(0);
23920 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23921 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23922 DAG.getVectorIdxConstant(0, SDLoc(N)));
23923 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23924 }
23925
23926 return SDValue();
23927}
23928
23929/// Target-specific DAG combine function for post-increment LD1 (lane) and
23930/// post-increment LD1R.
23931static SDValue performPostLD1Combine(SDNode *N,
23932                                     TargetLowering::DAGCombinerInfo &DCI,
23933                                     bool IsLaneOp) {
23934 if (DCI.isBeforeLegalizeOps())
23935 return SDValue();
23936
23937 SelectionDAG &DAG = DCI.DAG;
23938 EVT VT = N->getValueType(0);
23939
23940 if (!VT.is128BitVector() && !VT.is64BitVector())
23941 return SDValue();
23942
23943  // If it is not a LOAD, we cannot do this combine.
23944 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23945 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23946 if (!LD)
23947 return SDValue();
23948
23949 // If the Generic combiner already helped form a pre- or post-indexed load,
23950 // skip forming one here.
23951 if (LD->isIndexed())
23952 return SDValue();
23953
23954 // The vector lane must be a constant in the LD1LANE opcode.
23955 SDValue Lane;
23956 if (IsLaneOp) {
23957 Lane = N->getOperand(2);
23958 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23959 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23960 return SDValue();
23961 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
23962 return SDValue();
23963 }
23964
23965 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23966 EVT MemVT = LoadSDN->getMemoryVT();
23967 // Check if memory operand is the same type as the vector element.
23968 if (MemVT != VT.getVectorElementType())
23969 return SDValue();
23970
23971 // Check if there are other uses. If so, do not combine as it will introduce
23972 // an extra load.
23973 for (SDUse &U : LD->uses()) {
23974 if (U.getResNo() == 1) // Ignore uses of the chain result.
23975 continue;
23976 if (U.getUser() != N)
23977 return SDValue();
23978 }
23979
23980 // If there is one use and it can splat the value, prefer that operation.
23981 // TODO: This could be expanded to more operations if they reliably use the
23982 // index variants.
23983 if (N->hasOneUse()) {
23984 unsigned UseOpc = N->user_begin()->getOpcode();
23985 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23986 return SDValue();
23987 }
23988
23989 SDValue Addr = LD->getOperand(1);
23990 SDValue Vector = N->getOperand(0);
23991 // Search for a use of the address operand that is an increment.
23992 for (SDUse &Use : Addr->uses()) {
23993 SDNode *User = Use.getUser();
23994 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23995 continue;
23996
23997 // If the increment is a constant, it must match the memory ref size.
23998 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23999 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24000 uint32_t IncVal = CInc->getZExtValue();
24001 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24002 if (IncVal != NumBytes)
24003 continue;
24004 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24005 }
24006
24007    // To avoid cycle construction, make sure that neither the load nor the add
24008    // is a predecessor of the other or of the Vector.
24009    SmallPtrSet<const SDNode *, 32> Visited;
24010    SmallVector<const SDNode *, 16> Worklist;
24011    Visited.insert(Addr.getNode());
24012 Worklist.push_back(User);
24013 Worklist.push_back(LD);
24014 Worklist.push_back(Vector.getNode());
24015 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24016 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24017 continue;
24018
24019    SmallVector<SDValue, 8> Ops;
24020    Ops.push_back(LD->getOperand(0)); // Chain
24021 if (IsLaneOp) {
24022 Ops.push_back(Vector); // The vector to be inserted
24023 Ops.push_back(Lane); // The lane to be inserted in the vector
24024 }
24025 Ops.push_back(Addr);
24026 Ops.push_back(Inc);
24027
24028 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24029 SDVTList SDTys = DAG.getVTList(Tys);
24030 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24031 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24032 MemVT,
24033 LoadSDN->getMemOperand());
24034
24035 // Update the uses.
24036 SDValue NewResults[] = {
24037 SDValue(LD, 0), // The result of load
24038 SDValue(UpdN.getNode(), 2) // Chain
24039 };
24040 DCI.CombineTo(LD, NewResults);
24041 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24042 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24043
24044 break;
24045 }
24046 return SDValue();
24047}
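// As an example of the combine above (the LD1R form): a scalar load whose
// only vector use is a duplicate into all lanes, where the same address is
// also incremented by the element size, becomes a single post-incremented
// load, roughly "ld1r { v0.4s }, [x0], #4" (register choices illustrative),
// with the incremented address produced as the write-back result.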
24048
24049/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24050/// address translation.
24051static bool performTBISimplification(SDValue Addr,
24052                                     TargetLowering::DAGCombinerInfo &DCI,
24053                                     SelectionDAG &DAG) {
24054 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
24055 KnownBits Known;
24056  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24057                                        !DCI.isBeforeLegalizeOps());
24058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24059 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24060 DCI.CommitTargetLoweringOpt(TLO);
24061 return true;
24062 }
24063 return false;
24064}
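// For example, with top-byte-ignore in effect, an address computed as
//   (and X, 0x00ffffffffffffff)
// that only feeds memory accesses can be simplified to just X here, since
// bits 63:56 are ignored by the hardware during address translation.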
24065
24066static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24067 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24068 "Expected STORE dag node in input!");
24069
24070 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24071 if (!Store->isTruncatingStore() || Store->isIndexed())
24072 return SDValue();
24073 SDValue Ext = Store->getValue();
24074 auto ExtOpCode = Ext.getOpcode();
24075 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24076 ExtOpCode != ISD::ANY_EXTEND)
24077 return SDValue();
24078 SDValue Orig = Ext->getOperand(0);
24079 if (Store->getMemoryVT() != Orig.getValueType())
24080 return SDValue();
24081 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24082 Store->getBasePtr(), Store->getMemOperand());
24083 }
24084
24085 return SDValue();
24086}
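// Sketch of the fold above: a truncating store of an extended value, e.g.
//   truncstore<i8> (zext i8 %x to i32), ptr %p
// becomes a plain i8 store of %x, since the bits added by the extension are
// discarded by the truncation anyway.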
24087
24088// A custom combine to lower load <3 x i8> as the more efficient sequence
24089// below:
24090// ldrb wX, [x0, #2]
24091// ldrh wY, [x0]
24092// orr wX, wY, wX, lsl #16
24093// fmov s0, wX
24094//
24095// Note that an alternative sequence with even fewer (although usually more
24096// complex/expensive) instructions would be:
24097// ld1r.4h { v0 }, [x0], #2
24098// ld1.b { v0 }[2], [x0]
24099//
24100// Generating this sequence unfortunately results in noticeably worse codegen
24101// for code that extends the loaded v3i8, due to legalization breaking vector
24102// shuffle detection in a way that is very difficult to work around.
24103// TODO: Revisit once v3i8 legalization has been improved in general.
24104static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24105 EVT MemVT = LD->getMemoryVT();
24106 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24107 LD->getBaseAlign() >= 4)
24108 return SDValue();
24109
24110 SDLoc DL(LD);
24111  MachineFunction &MF = DAG.getMachineFunction();
24112  SDValue Chain = LD->getChain();
24113 SDValue BasePtr = LD->getBasePtr();
24114 MachineMemOperand *MMO = LD->getMemOperand();
24115 assert(LD->getOffset().isUndef() && "undef offset expected");
24116
24117 // Load 2 x i8, then 1 x i8.
24118 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24119 TypeSize Offset2 = TypeSize::getFixed(2);
24120 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24121 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24122 MF.getMachineMemOperand(MMO, 2, 1));
24123
24124 // Extend to i32.
24125 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24126 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24127
24128 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24129 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24130 DAG.getConstant(16, DL, MVT::i32));
24131 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24132 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24133
24134 // Extract v3i8 again.
24135 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24136 DAG.getConstant(0, DL, MVT::i64));
24137  SDValue TokenFactor = DAG.getNode(
24138      ISD::TokenFactor, DL, MVT::Other,
24139 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24140 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24141}
24142
24143// Perform TBI simplification if supported by the target, and try to break up
24144// non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP
24145// (Q-register) load instructions can be selected.
24146static SDValue performLOADCombine(SDNode *N,
24148 SelectionDAG &DAG,
24149 const AArch64Subtarget *Subtarget) {
24150 if (Subtarget->supportsAddressTopByteIgnored())
24151 performTBISimplification(N->getOperand(1), DCI, DAG);
24152
24153  LoadSDNode *LD = cast<LoadSDNode>(N);
24154  EVT RegVT = LD->getValueType(0);
24155 EVT MemVT = LD->getMemoryVT();
24156 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24157 SDLoc DL(LD);
24158
24159 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24160 unsigned AddrSpace = LD->getAddressSpace();
24161 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24162 AddrSpace == ARM64AS::PTR32_UPTR) {
24163 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24164 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24165 SDValue Cast =
24166 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24167 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24168 Cast, LD->getPointerInfo(), MemVT,
24169 LD->getBaseAlign(),
24170 LD->getMemOperand()->getFlags());
24171 }
24172 }
24173
24174 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24175 return SDValue(N, 0);
24176
24177 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24178 return Res;
24179
24180 if (!LD->isNonTemporal())
24181 return SDValue(N, 0);
24182
24183 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24184 MemVT.getSizeInBits() % 256 == 0 ||
24185 256 % MemVT.getScalarSizeInBits() != 0)
24186 return SDValue(N, 0);
24187
24188 SDValue Chain = LD->getChain();
24189 SDValue BasePtr = LD->getBasePtr();
24190 SDNodeFlags Flags = LD->getFlags();
24191  SmallVector<SDValue, 4> LoadOps;
24192  SmallVector<SDValue, 4> LoadOpsChain;
24193  // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24194  // and one scalar/vector load smaller than 256 bits. This way we can utilize
24195  // 256-bit loads and reduce the number of load instructions generated.
24196 MVT NewVT =
24197      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
24198                       256 / MemVT.getVectorElementType().getSizeInBits());
24199 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24200  // Create all 256-bit loads starting from offset 0 and up to (Num256Loads - 1) * 32.
24201 for (unsigned I = 0; I < Num256Loads; I++) {
24202 unsigned PtrOffset = I * 32;
24203 SDValue NewPtr = DAG.getMemBasePlusOffset(
24204 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24205 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24206 SDValue NewLoad = DAG.getLoad(
24207 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24208 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24209 LoadOps.push_back(NewLoad);
24210 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24211 }
24212
24213  // Process the remaining bits of the load operation.
24214  // This is done by creating an UNDEF vector that matches the size of the
24215  // 256-bit loads and inserting the remaining load into it. We extract the
24216  // original load type at the end using an EXTRACT_SUBVECTOR.
24217 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24218 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24219 MVT RemainingVT = MVT::getVectorVT(
24220      MemVT.getVectorElementType().getSimpleVT(),
24221      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24222 SDValue NewPtr = DAG.getMemBasePlusOffset(
24223 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24224 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24225 SDValue RemainingLoad =
24226 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24227 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24228 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24229 SDValue UndefVector = DAG.getUNDEF(NewVT);
24230 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24231 SDValue ExtendedRemainingLoad =
24232 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24233 {UndefVector, RemainingLoad, InsertIdx});
24234 LoadOps.push_back(ExtendedRemainingLoad);
24235 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24236 EVT ConcatVT =
24237      EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
24238                       LoadOps.size() * NewVT.getVectorNumElements());
24239 SDValue ConcatVectors =
24240 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24241 // Extract the original vector type size.
24242 SDValue ExtractSubVector =
24243 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24244 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24245  SDValue TokenFactor =
24246      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24247 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24248}
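// As an illustration of the non-temporal splitting above: a 384-bit
// non-temporal load of v24i16 becomes one 256-bit (v16i16) load at offset 0
// plus a 128-bit (v8i16) load at offset 32; the remainder is widened into a
// 256-bit slot, the pieces are concatenated, and the original v24i16 is
// extracted from the front, allowing LDNP to be used for the 256-bit portion.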
24249
24250static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24251 EVT VecVT = Op.getValueType();
24252 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24253 "Need boolean vector type.");
24254
24255 if (Depth > 3)
24256    return EVT();
24257
24258 // We can get the base type from a vector compare or truncate.
24259 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24260 return Op.getOperand(0).getValueType();
24261
24262 // If an operand is a bool vector, continue looking.
24263  EVT BaseVT(MVT::Other);
24264  for (SDValue Operand : Op->op_values()) {
24265 if (Operand.getValueType() != VecVT)
24266 continue;
24267
24268 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24269 if (!BaseVT.isSimple())
24270 BaseVT = OperandVT;
24271 else if (OperandVT != BaseVT)
24272      return EVT();
24273  }
24274
24275 return BaseVT;
24276}
24277
24278// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24279// iN, we can use a trick that extracts the i^th bit from the i^th element and
24280// then performs a vector reduce-add to get a scalar bitmask. This requires that each
24281// element's bits are either all 1 or all 0.
24282static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24283 SDLoc DL(N);
24284 SDValue ComparisonResult(N, 0);
24285 EVT VecVT = ComparisonResult.getValueType();
24286 assert(VecVT.isVector() && "Must be a vector type");
24287
24288 unsigned NumElts = VecVT.getVectorNumElements();
24289 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24290 return SDValue();
24291
24292 if (VecVT.getVectorElementType() != MVT::i1 &&
24293 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24294 return SDValue();
24295
24296 // If we can find the original types to work on instead of a vector of i1,
24297 // we can avoid extend/extract conversion instructions.
24298 if (VecVT.getVectorElementType() == MVT::i1) {
24299 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24300 if (!VecVT.isSimple()) {
24301 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24302 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24303 }
24304 }
24305 VecVT = VecVT.changeVectorElementTypeToInteger();
24306
24307 // Large vectors don't map directly to this conversion, so to avoid too many
24308 // edge cases, we don't apply it here. The conversion will likely still be
24309 // applied later via multiple smaller vectors, whose results are concatenated.
24310 if (VecVT.getSizeInBits() > 128)
24311 return SDValue();
24312
24313 // Ensure that all elements' bits are either 0s or 1s.
24314 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24315
24316 bool IsLE = DAG.getDataLayout().isLittleEndian();
24317 SmallVector<SDValue, 16> MaskConstants;
24319 VecVT == MVT::v16i8) {
24320    // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24321    // per entry. We split it into two halves, apply the mask, zip the halves to
24322    // create 8x 16-bit values, and then perform the vector reduce.
24323 for (unsigned Half = 0; Half < 2; ++Half) {
24324 for (unsigned I = 0; I < 8; ++I) {
24325 // On big-endian targets, the lane order in sub-byte vector elements
24326 // gets reversed, so we need to flip the bit index.
24327 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
24328 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24329 }
24330 }
24331 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24332 SDValue RepresentativeBits =
24333 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24334
24335 SDValue UpperRepresentativeBits =
24336 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24337 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24338 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24339 RepresentativeBits, UpperRepresentativeBits);
24340 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24341 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24342 }
24343
24344 // All other vector sizes.
24345 unsigned NumEl = VecVT.getVectorNumElements();
24346 for (unsigned I = 0; I < NumEl; ++I) {
24347 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
24348 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24349 }
24350
24351 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24352 SDValue RepresentativeBits =
24353 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24354 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24355 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24356 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24357}
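// A small example of the bitmask trick above: for a v4i32 comparison result
// (each lane all-ones or all-zeros) on a little-endian target, the mask
// constants are <1, 2, 4, 8>; AND-ing them in and reducing with VECREDUCE_ADD
// yields a scalar whose low four bits hold the per-lane results.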
24358
24359static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24360 StoreSDNode *Store) {
24361 if (!Store->isTruncatingStore())
24362 return SDValue();
24363
24364 SDLoc DL(Store);
24365 SDValue VecOp = Store->getValue();
24366 EVT VT = VecOp.getValueType();
24367 EVT MemVT = Store->getMemoryVT();
24368
24369 if (!MemVT.isVector() || !VT.isVector() ||
24370 MemVT.getVectorElementType() != MVT::i1)
24371 return SDValue();
24372
24373 // If we are storing a vector that we are currently building, let
24374 // `scalarizeVectorStore()` handle this more efficiently.
24375 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24376 return SDValue();
24377
24378 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24379 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24380 if (!VectorBits)
24381 return SDValue();
24382
24383 EVT StoreVT =
24385 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24386 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24387 Store->getMemOperand());
24388}
24389
24390// Combine store (fp_to_int X) to use vector semantics around the conversion
24391// when NEON is available. This allows us to store the in-vector result directly
24392// without transferring the result into a GPR in the process.
24393static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24395 SelectionDAG &DAG,
24396 const AArch64Subtarget *Subtarget) {
24397 // Limit to post-legalization in order to avoid peeling truncating stores.
24398 if (DCI.isBeforeLegalize())
24399 return SDValue();
24400 if (!Subtarget->isNeonAvailable())
24401 return SDValue();
24402  // Bail out if the stored value is already a vector.
24403 SDValue Value = ST->getValue();
24404 if (Value.getValueType().isVector())
24405 return SDValue();
24406
24407 // Look through potential assertions.
24408 while (Value->isAssert())
24409 Value = Value.getOperand(0);
24410
24411 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24412 Value.getOpcode() != ISD::FP_TO_UINT)
24413 return SDValue();
24414 if (!Value->hasOneUse())
24415 return SDValue();
24416
24417 SDValue FPSrc = Value.getOperand(0);
24418 EVT SrcVT = FPSrc.getValueType();
24419 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24420 return SDValue();
24421
24422  // No support for mismatched widths, such as i64 = fp_to_sint f32.
24423 EVT VT = Value.getSimpleValueType();
24424 if (VT != SrcVT.changeTypeToInteger())
24425 return SDValue();
24426
24427  // Create a 128-bit vector to avoid widening. The floating-point
24428  // conversion is transformed into a single-element conversion via a pattern.
24429 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24430 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24431 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24432 SDLoc DL(ST);
24433 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24434 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24435
24436  SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24437  SDValue Extracted =
24438 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24439
24440 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24441 return SDValue(ST, 0);
24442}
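// Rough codegen sketch for the combine above: for
//   store i32 (fp_to_sint float %f), ptr %p
// the conversion stays in an FP/SIMD register and is stored directly,
// roughly "fcvtzs s0, s0; str s0, [x0]" instead of
// "fcvtzs w8, s0; str w8, [x0]" (register choices illustrative).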
24443
24444bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24445 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24446 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24447 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24448}
24449
24450// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
24451static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24452 const AArch64Subtarget *Subtarget) {
24453 SDValue Value = ST->getValue();
24454 EVT ValueVT = Value.getValueType();
24455
24456 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24457 Value.getOpcode() != ISD::TRUNCATE ||
24458 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24459 return SDValue();
24460
24461 assert(ST->getOffset().isUndef() && "undef offset expected");
24462 SDLoc DL(ST);
24463 auto WideVT = EVT::getVectorVT(
24464 *DAG.getContext(),
24465 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24466 SDValue UndefVector = DAG.getUNDEF(WideVT);
24467 SDValue WideTrunc = DAG.getNode(
24468 ISD::INSERT_SUBVECTOR, DL, WideVT,
24469 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24470 SDValue Cast = DAG.getNode(
24471 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24472 WideTrunc);
24473
24474  MachineFunction &MF = DAG.getMachineFunction();
24475  SDValue Chain = ST->getChain();
24476 MachineMemOperand *MMO = ST->getMemOperand();
24477 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24478 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24479 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24480 TypeSize Offset2 = TypeSize::getFixed(2);
24481 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24482 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24483
24484 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24485 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24486 TypeSize Offset1 = TypeSize::getFixed(1);
24487 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24488 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24489
24490 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24491 DAG.getConstant(0, DL, MVT::i64));
24492 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24493 MF.getMachineMemOperand(MMO, 0, 1));
24494 return Chain;
24495}
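// Sketch of the resulting sequence for the <3 x i8> truncating store above:
// the truncation source is widened to four elements, bitcast to bytes, and
// three single-byte stores (roughly three strb instructions) are emitted at
// offsets 2, 1 and 0 on a common chain, avoiding a vector store.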
24496
24497static unsigned getFPSubregForVT(EVT VT) {
24498 assert(VT.isSimple() && "Expected simple VT");
24499 switch (VT.getSimpleVT().SimpleTy) {
24500 case MVT::aarch64mfp8:
24501 return AArch64::bsub;
24502 case MVT::f16:
24503 return AArch64::hsub;
24504 case MVT::f32:
24505 return AArch64::ssub;
24506 case MVT::f64:
24507 return AArch64::dsub;
24508 default:
24509 llvm_unreachable("Unexpected VT!");
24510 }
24511}
24512
24513static SDValue performSTORECombine(SDNode *N,
24515 SelectionDAG &DAG,
24516 const AArch64Subtarget *Subtarget) {
24517  StoreSDNode *ST = cast<StoreSDNode>(N);
24518  SDValue Chain = ST->getChain();
24519 SDValue Value = ST->getValue();
24520 SDValue Ptr = ST->getBasePtr();
24521 EVT ValueVT = Value.getValueType();
24522 EVT MemVT = ST->getMemoryVT();
24523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24524 SDLoc DL(ST);
24525
24526 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24527 return Res;
24528
24529 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24530 EVT EltVT = VT.getVectorElementType();
24531 return EltVT == MVT::f32 || EltVT == MVT::f64;
24532 };
24533
24534 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24535 unsigned AddrSpace = ST->getAddressSpace();
24536 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24537 AddrSpace == ARM64AS::PTR32_UPTR) {
24538 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24539 if (PtrVT != Ptr.getSimpleValueType()) {
24540 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24541 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24542 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24543 ST->getAAInfo());
24544 }
24545 }
24546
24547 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24548 return Res;
24549
24550 // If this is an FP_ROUND followed by a store, fold this into a truncating
24551 // store. We can do this even if this is already a truncstore.
24552 // We purposefully don't care about legality of the nodes here as we know
24553 // they can be split down into something legal.
24554 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24555 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24556 Subtarget->useSVEForFixedLengthVectors() &&
24557 ValueVT.isFixedLengthVector() &&
24558 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24559 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24560 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24561 ST->getMemOperand());
24562
24563 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24564 return Split;
24565
24566 if (Subtarget->supportsAddressTopByteIgnored() &&
24567 performTBISimplification(N->getOperand(2), DCI, DAG))
24568 return SDValue(N, 0);
24569
24570 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24571 return Store;
24572
24573 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24574 return Store;
24575
24576 if (ST->isTruncatingStore() &&
24577 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24578 if (SDValue Rshrnb =
24579 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24580 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24581 MemVT, ST->getMemOperand());
24582 }
24583 }
24584
24585 // This is an integer vector_extract_elt followed by a (possibly truncating)
24586 // store. We may be able to replace this with a store of an FP subregister.
24587 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24588 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24589
24590 SDValue Vector = Value.getOperand(0);
24591 SDValue ExtIdx = Value.getOperand(1);
24592 EVT VectorVT = Vector.getValueType();
24593 EVT ElemVT = VectorVT.getVectorElementType();
24594
24595 if (!ValueVT.isInteger())
24596 return SDValue();
24597
24598 // Propagate zero constants (applying this fold may miss optimizations).
24600 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24601 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24602 return SDValue();
24603 }
24604
24605 if (ValueVT != MemVT && !ST->isTruncatingStore())
24606 return SDValue();
24607
24608 // This could generate an additional extract if the index is non-zero and
24609 // the extracted value has multiple uses.
24610 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24611 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24612 return SDValue();
24613
24614 // These can lower to st1, which is preferable if we're unlikely to fold the
24615 // addressing into the store.
24616 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24617 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24618 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24619 return SDValue();
24620
24621 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24622 // Heuristic: If there are other users of w/x integer scalars extracted
24623 // from this vector that won't fold into the store -- abandon folding.
24624 // Applying this fold may disrupt paired stores.
24625 for (const auto &Use : Vector->uses()) {
24626 if (Use.getResNo() != Vector.getResNo())
24627 continue;
24628 const SDNode *User = Use.getUser();
24629 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24630 (!User->hasOneUse() ||
24631 (*User->user_begin())->getOpcode() != ISD::STORE))
24632 return SDValue();
24633 }
24634 }
24635
24636 SDValue ExtVector = Vector;
24637 if (!ExtCst || !ExtCst->isZero()) {
24638 // Handle extracting from lanes != 0.
24639      SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24640                                Value.getValueType(), Vector, ExtIdx);
24641      SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24642      ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24643 DAG.getUNDEF(VectorVT), Ext, Zero);
24644 }
24645
24646 EVT FPMemVT = MemVT == MVT::i8
24647                      ? MVT::aarch64mfp8
24648                      : EVT::getFloatingPointVT(MemVT.getSizeInBits());
24649 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24650 FPMemVT, ExtVector);
24651
24652 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24653 ST->getMemOperand());
24654 }
24655
24656 return SDValue();
24657}
24658
24659static bool
24660isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
24661 if (N->getOpcode() != ISD::CONCAT_VECTORS)
24662 return false;
24663
24664 unsigned NumParts = N->getNumOperands();
24665
24666 // We should be concatenating each sequential result from a
24667 // VECTOR_INTERLEAVE.
24668 SDNode *InterleaveOp = N->getOperand(0).getNode();
24669 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
24670 InterleaveOp->getNumOperands() != NumParts)
24671 return false;
24672
24673 for (unsigned I = 0; I < NumParts; I++)
24674 if (N->getOperand(I) != SDValue(InterleaveOp, I))
24675 return false;
24676
24677 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
24678 return true;
24679}
24680
24681static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
24682 SDValue WideMask,
24683 unsigned RequiredNumParts) {
24684 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
24685 SmallVector<SDValue, 4> MaskInterleaveOps;
24686 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
24687 MaskInterleaveOps))
24688 return SDValue();
24689
24690 if (MaskInterleaveOps.size() != RequiredNumParts)
24691 return SDValue();
24692
24693 // Make sure the inputs to the vector interleave are identical.
24694 if (!llvm::all_equal(MaskInterleaveOps))
24695 return SDValue();
24696
24697 return MaskInterleaveOps[0];
24698 }
24699
24700 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
24701 return SDValue();
24702
24703  ElementCount EC = WideMask.getValueType().getVectorElementCount();
24704  assert(EC.isKnownMultipleOf(RequiredNumParts) &&
24705 "Expected element count divisible by number of parts");
24706 EC = EC.divideCoefficientBy(RequiredNumParts);
24707 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
24708 WideMask->getOperand(0));
24709}
24710
24711static SDValue performInterleavedMaskedStoreCombine(
24712    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24713  if (!DCI.isBeforeLegalize())
24714 return SDValue();
24715
24716  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24717  SDValue WideValue = MST->getValue();
24718
24719 // Bail out if the stored value has an unexpected number of uses, since we'll
24720 // have to perform manual interleaving and may as well just use normal masked
24721 // stores. Also, discard masked stores that are truncating or indexed.
24722 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
24723 !MST->isSimple() || !MST->getOffset().isUndef())
24724 return SDValue();
24725
24726 SmallVector<SDValue, 4> ValueInterleaveOps;
24727 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
24728 ValueInterleaveOps))
24729 return SDValue();
24730
24731 unsigned NumParts = ValueInterleaveOps.size();
24732 if (NumParts != 2 && NumParts != 4)
24733 return SDValue();
24734
24735 // At the moment we're unlikely to see a fixed-width vector interleave as
24736 // we usually generate shuffles instead.
24737 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
24738 if (!SubVecTy.isScalableVT() ||
24739 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
24740 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
24741 return SDValue();
24742
24743 SDLoc DL(N);
24744 SDValue NarrowMask =
24745 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
24746 if (!NarrowMask)
24747 return SDValue();
24748
24749 const Intrinsic::ID IID =
24750 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
24751 SmallVector<SDValue, 8> NewStOps;
24752 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
24753 NewStOps.append(ValueInterleaveOps);
24754 NewStOps.append({NarrowMask, MST->getBasePtr()});
24755 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
24756}
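
// Illustrative sketch (the helper name below is invented for illustration and
// is not part of the lowering): the combine above relies on the fact that a
// masked store of concat(vector_interleave(A, B)) writes A[i] and B[i] as the
// two fields of structure i, predicated by one shared lane mask - exactly the
// ST2 memory layout. In scalar form:
static inline void exampleInterleave2MaskedStore(const int *A, const int *B,
                                                 const bool *LaneMask, int *Out,
                                                 unsigned NumElts) {
  for (unsigned I = 0; I < NumElts; ++I) {
    if (!LaneMask[I])
      continue;            // the narrow mask predicates both fields of a pair
    Out[2 * I] = A[I];     // field 0 of structure I
    Out[2 * I + 1] = B[I]; // field 1 of structure I
  }
}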
24757
24758static SDValue performMSTORECombine(SDNode *N,
24759 TargetLowering::DAGCombinerInfo &DCI,
24760 SelectionDAG &DAG,
24761 const AArch64Subtarget *Subtarget) {
24762 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24763 SDValue Value = MST->getValue();
24764 SDValue Mask = MST->getMask();
24765 SDLoc DL(N);
24766
24767 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
24768 return Res;
24769
24770 // If this is a UZP1 followed by a masked store, fold this into a masked
24771 // truncating store. We can do this even if this is already a masked
24772 // truncstore.
24773 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24774 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24775 Value.getValueType().isInteger()) {
24776 Value = Value.getOperand(0);
24777 if (Value.getOpcode() == ISD::BITCAST) {
24778 EVT HalfVT =
24779 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24780 EVT InVT = Value.getOperand(0).getValueType();
24781
24782 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24783 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24784 unsigned PgPattern = Mask->getConstantOperandVal(0);
24785
24786 // Ensure we can double the size of the predicate pattern
24787 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24788 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24789 MinSVESize) {
24790 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24791 PgPattern);
24792 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24793 MST->getBasePtr(), MST->getOffset(), Mask,
24794 MST->getMemoryVT(), MST->getMemOperand(),
24795 MST->getAddressingMode(),
24796 /*IsTruncating=*/true);
24797 }
24798 }
24799 }
24800 }
24801
24802 if (MST->isTruncatingStore()) {
24803 EVT ValueVT = Value->getValueType(0);
24804 EVT MemVT = MST->getMemoryVT();
24805 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24806 return SDValue();
24807 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24808 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24809 MST->getOffset(), MST->getMask(),
24810 MST->getMemoryVT(), MST->getMemOperand(),
24811 MST->getAddressingMode(), true);
24812 }
24813 }
24814
24815 return SDValue();
24816}
24817
24818/// \return true if part of the index was folded into the Base.
24819static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24820 SDLoc DL, SelectionDAG &DAG) {
24821 // This function assumes a vector of i64 indices.
24822 EVT IndexVT = Index.getValueType();
24823 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24824 return false;
24825
24826 // Simplify:
24827 // BasePtr = Ptr
24828 // Index = X + splat(Offset)
24829 // ->
24830 // BasePtr = Ptr + Offset * scale.
24831 // Index = X
24832 if (Index.getOpcode() == ISD::ADD) {
24833 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24834 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24835 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24836 Index = Index.getOperand(0);
24837 return true;
24838 }
24839 }
24840
24841 // Simplify:
24842 // BasePtr = Ptr
24843 // Index = (X + splat(Offset)) << splat(Shift)
24844 // ->
24845 // BasePtr = Ptr + (Offset << Shift) * scale
24846 // Index = X << splat(shift)
24847 if (Index.getOpcode() == ISD::SHL &&
24848 Index.getOperand(0).getOpcode() == ISD::ADD) {
24849 SDValue Add = Index.getOperand(0);
24850 SDValue ShiftOp = Index.getOperand(1);
24851 SDValue OffsetOp = Add.getOperand(1);
24852 if (auto Shift = DAG.getSplatValue(ShiftOp))
24853 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
24854 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
24855 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24856 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24857 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
24858 Add.getOperand(0), ShiftOp);
24859 return true;
24860 }
24861 }
24862
24863 return false;
24864}
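
// Illustrative sketch (helper invented for illustration): the rewrite in
// foldIndexIntoBase is just the scalar identity
//   Ptr + (X[i] + Offset) * Scale == (Ptr + Offset * Scale) + X[i] * Scale
// so a splatted offset in the index can be folded once into the base pointer.
// Unsigned arithmetic is used so the identity also holds under wrapping.
static inline bool exampleIndexFoldPreservesAddress(uint64_t Ptr, uint64_t XElt,
                                                    uint64_t Offset,
                                                    uint64_t Scale) {
  uint64_t Before = Ptr + (XElt + Offset) * Scale;        // Index = X + splat(Offset)
  uint64_t After = (Ptr + Offset * Scale) + XElt * Scale; // Index = X, adjusted BasePtr
  return Before == After;                                 // always true
}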
24865
24866// Analyse the specified address returning true if a more optimal addressing
24867// mode is available. When returning true all parameters are updated to reflect
24868// their recommended values.
24869 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24870 SDValue &BasePtr, SDValue &Index,
24871 SelectionDAG &DAG) {
24872 // Try to iteratively fold parts of the index into the base pointer to
24873 // simplify the index as much as possible.
24874 bool Changed = false;
24875 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24876 Changed = true;
24877
24878 // Only consider element types that are pointer sized as smaller types can
24879 // be easily promoted.
24880 EVT IndexVT = Index.getValueType();
24881 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24882 return Changed;
24883
24884 // Can indices be trivially shrunk?
24885 EVT DataVT = N->getOperand(1).getValueType();
24886 // Don't attempt to shrink the index for fixed vectors of 64-bit data since it
24887 // will later be re-extended to 64 bits in legalization.
24888 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24889 return Changed;
24890 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24891 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24892 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24893 return true;
24894 }
24895
24896 // Match:
24897 // Index = step(const)
24898 int64_t Stride = 0;
24899 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24900 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24901 }
24902 // Match:
24903 // Index = step(const) << shift(const)
24904 else if (Index.getOpcode() == ISD::SHL &&
24905 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24906 SDValue RHS = Index.getOperand(1);
24907 if (auto *Shift =
24908 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24909 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24910 Stride = Step << Shift->getZExtValue();
24911 }
24912 }
24913
24914 // Return early because no supported pattern is found.
24915 if (Stride == 0)
24916 return Changed;
24917
24918 if (Stride < std::numeric_limits<int32_t>::min() ||
24919 Stride > std::numeric_limits<int32_t>::max())
24920 return Changed;
24921
24922 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24923 unsigned MaxVScale =
24925 int64_t LastElementOffset =
24926 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24927
24928 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24929 LastElementOffset > std::numeric_limits<int32_t>::max())
24930 return Changed;
24931
24932 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24933 // Stride is not scaled explicitly by 'Scale', because that scaling happens in
24934 // the gather/scatter addressing mode.
24935 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24936 return true;
24937}
24938
24939 static SDValue performMaskedGatherScatterCombine(
24940 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24941 if (!DCI.isBeforeLegalize())
24942 return SDValue();
24943 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24944
24945 SDLoc DL(MGS);
24946 SDValue Chain = MGS->getChain();
24947 SDValue Scale = MGS->getScale();
24948 SDValue Index = MGS->getIndex();
24949 SDValue Mask = MGS->getMask();
24950 SDValue BasePtr = MGS->getBasePtr();
24951 ISD::MemIndexType IndexType = MGS->getIndexType();
24952
24953 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24954 return SDValue();
24955
24956 // Here we catch such cases early and change MGATHER's IndexType to allow
24957 // the use of an Index that's more legalisation friendly.
24958 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24959 SDValue PassThru = MGT->getPassThru();
24960 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24961 return DAG.getMaskedGather(
24962 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24963 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24964 }
24965 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24966 SDValue Data = MSC->getValue();
24967 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24968 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24969 DL, Ops, MSC->getMemOperand(), IndexType,
24970 MSC->isTruncatingStore());
24971 }
24972 auto *HG = cast<MaskedHistogramSDNode>(MGS);
24973 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24974 Index, Scale, HG->getIntID()};
24975 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24976 DL, Ops, HG->getMemOperand(), IndexType);
24977}
24978
24979/// Target-specific DAG combine function for NEON load/store intrinsics
24980/// to merge base address updates.
24981 static SDValue performNEONPostLDSTCombine(SDNode *N,
24982 TargetLowering::DAGCombinerInfo &DCI,
24983 SelectionDAG &DAG) {
24984 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24985 return SDValue();
24986
24987 unsigned AddrOpIdx = N->getNumOperands() - 1;
24988 SDValue Addr = N->getOperand(AddrOpIdx);
24989
24990 // Search for a use of the address operand that is an increment.
24991 for (SDUse &Use : Addr->uses()) {
24992 SDNode *User = Use.getUser();
24993 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24994 continue;
24995
24996 // Check that the add is independent of the load/store. Otherwise, folding
24997 // it would create a cycle.
24998 SmallPtrSet<const SDNode *, 32> Visited;
24999 SmallVector<const SDNode *, 16> Worklist;
25000 Visited.insert(Addr.getNode());
25001 Worklist.push_back(N);
25002 Worklist.push_back(User);
25003 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25004 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25005 continue;
25006
25007 // Find the new opcode for the updating load/store.
25008 bool IsStore = false;
25009 bool IsLaneOp = false;
25010 bool IsDupOp = false;
25011 unsigned NewOpc = 0;
25012 unsigned NumVecs = 0;
25013 unsigned IntNo = N->getConstantOperandVal(1);
25014 switch (IntNo) {
25015 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25016 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25017 NumVecs = 2; break;
25018 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25019 NumVecs = 3; break;
25020 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25021 NumVecs = 4; break;
25022 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25023 NumVecs = 2; IsStore = true; break;
25024 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25025 NumVecs = 3; IsStore = true; break;
25026 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25027 NumVecs = 4; IsStore = true; break;
25028 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25029 NumVecs = 2; break;
25030 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25031 NumVecs = 3; break;
25032 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25033 NumVecs = 4; break;
25034 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25035 NumVecs = 2; IsStore = true; break;
25036 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25037 NumVecs = 3; IsStore = true; break;
25038 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25039 NumVecs = 4; IsStore = true; break;
25040 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25041 NumVecs = 2; IsDupOp = true; break;
25042 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25043 NumVecs = 3; IsDupOp = true; break;
25044 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25045 NumVecs = 4; IsDupOp = true; break;
25046 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25047 NumVecs = 2; IsLaneOp = true; break;
25048 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25049 NumVecs = 3; IsLaneOp = true; break;
25050 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25051 NumVecs = 4; IsLaneOp = true; break;
25052 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25053 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25054 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25055 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25056 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25057 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25058 }
25059
25060 EVT VecTy;
25061 if (IsStore)
25062 VecTy = N->getOperand(2).getValueType();
25063 else
25064 VecTy = N->getValueType(0);
25065
25066 // If the increment is a constant, it must match the memory ref size.
25067 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25068 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25069 uint32_t IncVal = CInc->getZExtValue();
25070 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25071 if (IsLaneOp || IsDupOp)
25072 NumBytes /= VecTy.getVectorNumElements();
25073 if (IncVal != NumBytes)
25074 continue;
25075 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25076 }
25077 SmallVector<SDValue, 8> Ops;
25078 Ops.push_back(N->getOperand(0)); // Incoming chain
25079 // Load lane and store have vector list as input.
25080 if (IsLaneOp || IsStore)
25081 for (unsigned i = 2; i < AddrOpIdx; ++i)
25082 Ops.push_back(N->getOperand(i));
25083 Ops.push_back(Addr); // Base register
25084 Ops.push_back(Inc);
25085
25086 // Return Types.
25087 EVT Tys[6];
25088 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25089 unsigned n;
25090 for (n = 0; n < NumResultVecs; ++n)
25091 Tys[n] = VecTy;
25092 Tys[n++] = MVT::i64; // Type of write back register
25093 Tys[n] = MVT::Other; // Type of the chain
25094 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25095
25096 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25097 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25098 MemInt->getMemoryVT(),
25099 MemInt->getMemOperand());
25100
25101 // Update the uses.
25102 std::vector<SDValue> NewResults;
25103 for (unsigned i = 0; i < NumResultVecs; ++i) {
25104 NewResults.push_back(SDValue(UpdN.getNode(), i));
25105 }
25106 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25107 DCI.CombineTo(N, NewResults);
25108 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25109
25110 break;
25111 }
25112 return SDValue();
25113}
25114
25115// Checks to see if the value is the prescribed width and returns information
25116// about its extension mode.
25117static
25118bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25119 ExtType = ISD::NON_EXTLOAD;
25120 switch(V.getNode()->getOpcode()) {
25121 default:
25122 return false;
25123 case ISD::LOAD: {
25124 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25125 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25126 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25127 ExtType = LoadNode->getExtensionType();
25128 return true;
25129 }
25130 return false;
25131 }
25132 case ISD::AssertSext: {
25133 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25134 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25135 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25136 ExtType = ISD::SEXTLOAD;
25137 return true;
25138 }
25139 return false;
25140 }
25141 case ISD::AssertZext: {
25142 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25143 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25144 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25145 ExtType = ISD::ZEXTLOAD;
25146 return true;
25147 }
25148 return false;
25149 }
25150 case ISD::Constant:
25151 case ISD::TargetConstant: {
25152 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25153 1LL << (width - 1);
25154 }
25155 }
25156
25157 return true;
25158}
25159
25160// This function does a whole lot of voodoo to determine if the tests are
25161// equivalent without and with a mask. Essentially what happens is that given a
25162// DAG resembling:
25163//
25164// +-------------+ +-------------+ +-------------+ +-------------+
25165// | Input | | AddConstant | | CompConstant| | CC |
25166// +-------------+ +-------------+ +-------------+ +-------------+
25167// | | | |
25168// V V | +----------+
25169// +-------------+ +----+ | |
25170// | ADD | |0xff| | |
25171// +-------------+ +----+ | |
25172// | | | |
25173// V V | |
25174// +-------------+ | |
25175// | AND | | |
25176// +-------------+ | |
25177// | | |
25178// +-----+ | |
25179// | | |
25180// V V V
25181// +-------------+
25182// | CMP |
25183// +-------------+
25184//
25185// The AND node may be safely removed for some combinations of inputs. In
25186// particular we need to take into account the extension type of the Input,
25187// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25188// width of the input (this can work for any width inputs, the above graph is
25189 // specific to 8 bits).
25190//
25191// The specific equations were worked out by generating output tables for each
25192 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
25193// problem was simplified by working with 4 bit inputs, which means we only
25194// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25195// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25196// patterns present in both extensions (0,7). For every distinct set of
25197 // AddConstant and CompConstant bit patterns we can consider the masked and
25198 // unmasked versions to be equivalent if the result of this function is true for
25199 // all 16 distinct bit patterns for the current extension type of Input (w0).
25200//
25201// sub w8, w0, w1
25202// and w10, w8, #0x0f
25203// cmp w8, w2
25204// cset w9, AArch64CC
25205// cmp w10, w2
25206// cset w11, AArch64CC
25207// cmp w9, w11
25208// cset w0, eq
25209// ret
25210//
25211// Since the above function shows when the outputs are equivalent it defines
25212// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25213// would be expensive to run during compiles. The equations below were written
25214 // in a test harness that confirmed they gave outputs equivalent to the above
25215 // function for all inputs, so they can be used to determine if the removal is
25216// legal instead.
25217//
25218 // isEquivalentMaskless() is the code for testing if the AND can be removed,
25219 // factored out of the DAG recognition, as the DAG can take several forms.
25220
25221static bool isEquivalentMaskless(unsigned CC, unsigned width,
25222 ISD::LoadExtType ExtType, int AddConstant,
25223 int CompConstant) {
25224 // By being careful about our equations and only writing them in terms of
25225 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
25226 // make them generally applicable to all bit widths.
25227 int MaxUInt = (1 << width);
25228
25229 // For the purposes of these comparisons sign extending the type is
25230 // equivalent to zero extending the add and displacing it by half the integer
25231 // width. Provided we are careful and make sure our equations are valid over
25232 // the whole range we can just adjust the input and avoid writing equations
25233 // for sign extended inputs.
25234 if (ExtType == ISD::SEXTLOAD)
25235 AddConstant -= (1 << (width-1));
25236
25237 switch(CC) {
25238 case AArch64CC::LE:
25239 case AArch64CC::GT:
25240 if ((AddConstant == 0) ||
25241 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25242 (AddConstant >= 0 && CompConstant < 0) ||
25243 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25244 return true;
25245 break;
25246 case AArch64CC::LT:
25247 case AArch64CC::GE:
25248 if ((AddConstant == 0) ||
25249 (AddConstant >= 0 && CompConstant <= 0) ||
25250 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25251 return true;
25252 break;
25253 case AArch64CC::HI:
25254 case AArch64CC::LS:
25255 if ((AddConstant >= 0 && CompConstant < 0) ||
25256 (AddConstant <= 0 && CompConstant >= -1 &&
25257 CompConstant < AddConstant + MaxUInt))
25258 return true;
25259 break;
25260 case AArch64CC::PL:
25261 case AArch64CC::MI:
25262 if ((AddConstant == 0) ||
25263 (AddConstant > 0 && CompConstant <= 0) ||
25264 (AddConstant < 0 && CompConstant <= AddConstant))
25265 return true;
25266 break;
25267 case AArch64CC::LO:
25268 case AArch64CC::HS:
25269 if ((AddConstant >= 0 && CompConstant <= 0) ||
25270 (AddConstant <= 0 && CompConstant >= 0 &&
25271 CompConstant <= AddConstant + MaxUInt))
25272 return true;
25273 break;
25274 case AArch64CC::EQ:
25275 case AArch64CC::NE:
25276 if ((AddConstant > 0 && CompConstant < 0) ||
25277 (AddConstant < 0 && CompConstant >= 0 &&
25278 CompConstant < AddConstant + MaxUInt) ||
25279 (AddConstant >= 0 && CompConstant >= 0 &&
25280 CompConstant >= AddConstant) ||
25281 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
25282 return true;
25283 break;
25284 case AArch64CC::VS:
25285 case AArch64CC::VC:
25286 case AArch64CC::AL:
25287 case AArch64CC::NV:
25288 return true;
25289 case AArch64CC::Invalid:
25290 break;
25291 }
25292
25293 return false;
25294}
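
// Illustrative sketch of the brute-force methodology described above (the
// helper is invented for illustration and is not used by the lowering): for a
// given AddConstant/CompConstant pair and one condition (signed greater-than),
// walk every 4-bit zero-extended input and check that the masked and unmasked
// comparisons agree, mirroring the cmp/cset harness shown in the comment.
static inline bool exampleMaskedUnmaskedAgreeGT(int AddConstant,
                                                int CompConstant) {
  for (int W0 = 0; W0 <= 15; ++W0) { // all zero-extended 4-bit inputs
    int Sub = W0 - AddConstant;      // sub w8, w0, w1
    int Masked = Sub & 0xF;          // and w10, w8, #0x0f
    if ((Sub > CompConstant) != (Masked > CompConstant))
      return false;                  // the two cset results differ
  }
  return true;                       // the AND could be dropped for GT here
}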
25295
25296 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
25297 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
25298 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25299 SDNode *AndNode, SelectionDAG &DAG,
25300 unsigned CCIndex, unsigned CmpIndex,
25301 unsigned CC) {
25302 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25303 if (!SubsC)
25304 return SDValue();
25305
25306 APInt SubsAP = SubsC->getAPIntValue();
25307 if (CC == AArch64CC::HI) {
25308 if (!SubsAP.isMask())
25309 return SDValue();
25310 } else if (CC == AArch64CC::LO) {
25311 if (!SubsAP.isPowerOf2())
25312 return SDValue();
25313 } else
25314 return SDValue();
25315
25316 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25317 if (!AndC)
25318 return SDValue();
25319
25320 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25321
25322 SDLoc DL(N);
25323 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25324 SDValue ANDS = DAG.getNode(
25325 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25326 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25327 SDValue AArch64_CC =
25328 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
25329 N->getOperand(CCIndex)->getValueType(0));
25330
25331 // For now, only performCSELCombine and performBRCONDCombine call this
25332 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
25333 // operands. So just initialize the ops directly to simplify the code. If some
25334 // other case appears with a different CCIndex or CmpIndex, we will need a loop
25335 // to rewrite the code here.
25336 // TODO: Do we need to assert number of operand is 4 here?
25337 assert((CCIndex == 2 && CmpIndex == 3) &&
25338 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25339 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25340 ANDS.getValue(1)};
25341 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25342}
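
// Illustrative sketch (invented helper, plain C++): with Mask = 2^K - 1,
// "(X & C) >u Mask" holds exactly when some bit of X & C above the mask is
// set, i.e. "(X & (C & ~Mask)) != 0", which is what the ANDS built above
// tests directly.
static inline bool exampleSubsToAndsEquivalent(uint32_t X, uint32_t C,
                                               unsigned K) {
  uint32_t Mask = K >= 32 ? ~0u : (1u << K) - 1; // low-bit mask 2^K - 1
  bool CmpHi = (X & C) > Mask;                   // (X & C) >u Mask
  bool Ands = (X & (C & ~Mask)) != 0;            // ANDS with adjusted constant
  return CmpHi == Ands;                          // agrees for every X, C, K
}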
25343
25344static
25345 SDValue performCONDCombine(SDNode *N,
25346 TargetLowering::DAGCombinerInfo &DCI,
25347 SelectionDAG &DAG, unsigned CCIndex,
25348 unsigned CmpIndex) {
25349 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25350 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25351 unsigned CondOpcode = SubsNode->getOpcode();
25352
25353 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25354 !SubsNode->hasOneUse())
25355 return SDValue();
25356
25357 // There is a SUBS feeding this condition. Is it fed by a mask we can
25358 // use?
25359
25360 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25361 unsigned MaskBits = 0;
25362
25363 if (AndNode->getOpcode() != ISD::AND)
25364 return SDValue();
25365
25366 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25367 CmpIndex, CC))
25368 return Val;
25369
25370 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25371 uint32_t CNV = CN->getZExtValue();
25372 if (CNV == 255)
25373 MaskBits = 8;
25374 else if (CNV == 65535)
25375 MaskBits = 16;
25376 }
25377
25378 if (!MaskBits)
25379 return SDValue();
25380
25381 SDValue AddValue = AndNode->getOperand(0);
25382
25383 if (AddValue.getOpcode() != ISD::ADD)
25384 return SDValue();
25385
25386 // The basic dag structure is correct, grab the inputs and validate them.
25387
25388 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25389 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25390 SDValue SubsInputValue = SubsNode->getOperand(1);
25391
25392 // The mask is present and the provenance of all the values is a smaller type,
25393 // let's see if the mask is superfluous.
25394
25395 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25396 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25397 return SDValue();
25398
25399 ISD::LoadExtType ExtType;
25400
25401 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25402 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25403 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25404 return SDValue();
25405
25406 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25407 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25408 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25409 return SDValue();
25410
25411 // The AND is not necessary, remove it.
25412
25413 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25414 SubsNode->getValueType(1));
25415 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25416
25417 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25418 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25419
25420 return SDValue(N, 0);
25421}
25422
25423// Optimize compare with zero and branch.
25424 static SDValue performBRCONDCombine(SDNode *N,
25425 TargetLowering::DAGCombinerInfo &DCI,
25426 SelectionDAG &DAG) {
25427 MachineFunction &MF = DAG.getMachineFunction();
25428 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25429 // will not be produced, as they are conditional branch instructions that do
25430 // not set flags.
25431 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25432 return SDValue();
25433
25434 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25435 N = NV.getNode();
25436 SDValue Chain = N->getOperand(0);
25437 SDValue Dest = N->getOperand(1);
25438 SDValue CCVal = N->getOperand(2);
25439 SDValue Cmp = N->getOperand(3);
25440
25441 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25442 unsigned CC = CCVal->getAsZExtVal();
25443 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25444 return SDValue();
25445
25446 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25447 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25448 SDValue CSel = Cmp.getOperand(0);
25449 auto CSelCC = getCSETCondCode(CSel);
25450 if (CSelCC) {
25451 SDLoc DL(N);
25452 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25453 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25454 CSel.getOperand(3));
25455 }
25456 }
25457
25458 unsigned CmpOpc = Cmp.getOpcode();
25459 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25460 return SDValue();
25461
25462 // Only attempt folding if there is only one use of the flag and no use of the
25463 // value.
25464 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25465 return SDValue();
25466
25467 SDValue LHS = Cmp.getOperand(0);
25468 SDValue RHS = Cmp.getOperand(1);
25469
25470 assert(LHS.getValueType() == RHS.getValueType() &&
25471 "Expected the value type to be the same for both operands!");
25472 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25473 return SDValue();
25474
25475 if (isNullConstant(LHS))
25476 std::swap(LHS, RHS);
25477
25478 if (!isNullConstant(RHS))
25479 return SDValue();
25480
25481 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25482 LHS.getOpcode() == ISD::SRL)
25483 return SDValue();
25484
25485 // Fold the compare into the branch instruction.
25486 SDValue BR;
25487 if (CC == AArch64CC::EQ)
25488 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25489 else
25490 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25491
25492 // Do not add new nodes to DAG combiner worklist.
25493 DCI.CombineTo(N, BR, false);
25494
25495 return SDValue();
25496}
25497
25498 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25499 unsigned CC = N->getConstantOperandVal(2);
25500 SDValue SUBS = N->getOperand(3);
25501 SDValue Zero, CTTZ;
25502
25503 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25504 Zero = N->getOperand(0);
25505 CTTZ = N->getOperand(1);
25506 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25507 Zero = N->getOperand(1);
25508 CTTZ = N->getOperand(0);
25509 } else
25510 return SDValue();
25511
25512 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25513 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25514 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25515 return SDValue();
25516
25517 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25518 "Illegal type in CTTZ folding");
25519
25520 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25521 return SDValue();
25522
25523 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25524 ? CTTZ.getOperand(0).getOperand(0)
25525 : CTTZ.getOperand(0);
25526
25527 if (X != SUBS.getOperand(0))
25528 return SDValue();
25529
25530 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25531 ? CTTZ.getOperand(0).getValueSizeInBits()
25532 : CTTZ.getValueSizeInBits();
25533 SDValue BitWidthMinusOne =
25534 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25535 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25536 BitWidthMinusOne);
25537}
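
// Illustrative sketch (invented helper): the fold relies on CTTZ(0) being the
// bit width, and on BitWidth & (BitWidth - 1) == 0 for the power-of-two widths
// handled here, so "X == 0 ? 0 : cttz(X)" collapses to "cttz(X) & (BitWidth-1)".
static inline bool exampleCttzSelectFoldHolds(uint32_t X) {
  unsigned Cttz = X ? static_cast<unsigned>(__builtin_ctz(X)) : 32u;
  unsigned Selected = X == 0 ? 0u : Cttz; // CSEL 0, cttz(X), eq(X, 0)
  unsigned Folded = Cttz & 31u;           // AND cttz, bitwidth - 1
  return Selected == Folded;              // true for every X
}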
25538
25539// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25540// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25541// Where x and y are constants and x != y
25542
25543// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25544// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25545// Where x and y are constants and x != y
25546 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25547 SDValue L = Op->getOperand(0);
25548 SDValue R = Op->getOperand(1);
25549 AArch64CC::CondCode OpCC =
25550 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25551
25552 SDValue OpCmp = Op->getOperand(3);
25553 if (!isCMP(OpCmp))
25554 return SDValue();
25555
25556 SDValue CmpLHS = OpCmp.getOperand(0);
25557 SDValue CmpRHS = OpCmp.getOperand(1);
25558
25559 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25560 std::swap(CmpLHS, CmpRHS);
25561 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25562 return SDValue();
25563
25564 SDValue X = CmpLHS->getOperand(0);
25565 SDValue Y = CmpLHS->getOperand(1);
25566 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25567 return SDValue();
25568 }
25569
25570 // If one of the constants is an opaque constant, the x and y SDNodes can still
25571 // differ even though the real values are the same. So check the APInts here to
25572 // make sure the code is correct.
25573 ConstantSDNode *CX = cast<ConstantSDNode>(X);
25574 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
25575 if (CX->getAPIntValue() == CY->getAPIntValue())
25576 return SDValue();
25577
25578 AArch64CC::CondCode CC =
25579 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25580 SDValue Cond = CmpLHS->getOperand(3);
25581
25582 if (CmpRHS == Y)
25583 CC = getInvertedCondCode(CC);
25584 else if (CmpRHS != X)
25585 return SDValue();
25586
25587 if (OpCC == AArch64CC::NE)
25588 CC = getInvertedCondCode(CC);
25589 else if (OpCC != AArch64CC::EQ)
25590 return SDValue();
25591
25592 SDLoc DL(Op);
25593 EVT VT = Op->getValueType(0);
25594
25595 SDValue CCValue = getCondCode(DAG, CC);
25596 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25597}
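
// Illustrative sketch (invented helper): comparing the result of the inner
// "cond ? X : Y" (with constants X != Y) against X just reproduces cond, so
// the outer select can reuse cc2/cond instead of re-materialising the compare.
static inline bool exampleCselOfCselHolds(bool Cond2, int X, int Y, int L,
                                          int R) {
  int Inner = Cond2 ? X : Y;             // CSEL x, y, cc2, cond   (X != Y)
  int ViaCompare = Inner == X ? L : R;   // CSEL l, r, EQ, (CMP Inner, x)
  int Direct = Cond2 ? L : R;            // CSEL l, r, cc2, cond
  return X == Y || ViaCompare == Direct; // equivalent whenever X != Y
}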
25598
25599// Reassociate the true/false expressions of a CSEL instruction to obtain a
25600// common subexpression with the comparison instruction. For example, change
25601// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25602// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25603// subexpression.
25604 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25605 SDValue SubsNode = N->getOperand(3);
25606 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25607 return SDValue();
25608
25609 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25610 SDValue CmpOpOther = SubsNode.getOperand(0);
25611 EVT VT = N->getValueType(0);
25612
25613 unsigned ExpectedOpcode;
25614 SDValue ExpectedOp;
25615 SDValue SubsOp;
25616 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25617 if (CmpOpConst) {
25618 ExpectedOpcode = ISD::ADD;
25619 ExpectedOp =
25620 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25621 CmpOpConst->getValueType(0));
25622 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25623 CmpOpConst->getValueType(0));
25624 } else {
25625 ExpectedOpcode = ISD::SUB;
25626 ExpectedOp = CmpOpToMatch;
25627 SubsOp = CmpOpToMatch;
25628 }
25629
25630 // Get the operand that can be reassociated with the SUBS instruction.
25631 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25632 if (Op.getOpcode() != ExpectedOpcode)
25633 return SDValue();
25634 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25635 !Op.getOperand(0).hasOneUse())
25636 return SDValue();
25637 SDValue X = Op.getOperand(0).getOperand(0);
25638 SDValue Y = Op.getOperand(0).getOperand(1);
25639 if (X != CmpOpOther)
25640 std::swap(X, Y);
25641 if (X != CmpOpOther)
25642 return SDValue();
25643 if (ExpectedOp != Op.getOperand(1))
25644 return SDValue();
25645 return Y;
25646 };
25647
25648 // Try the reassociation using the given constant and condition code.
25649 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25650 SDValue SubsOp) {
25651 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25652 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25653 if (!TReassocOp && !FReassocOp)
25654 return SDValue();
25655
25656 SDValue NewCmp =
25657 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25658 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25659
25660 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25661 if (!ReassocOp)
25662 return N->getOperand(OpNum);
25663 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25664 NewCmp.getValue(0), ReassocOp);
25665 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25666 return Res;
25667 };
25668
25669 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25670 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25671 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25672 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25673 };
25674
25675 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25676
25677 // First, try to eliminate the compare instruction by searching for a
25678 // subtraction with the same constant.
25679 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25680 return R;
25681
25682 if (!CmpOpConst) {
25683 // Try again with the operands of the SUBS instruction and the condition
25684 // swapped. Due to canonicalization, this only helps for non-constant
25685 // operands of the SUBS instruction.
25686 std::swap(CmpOpToMatch, CmpOpOther);
25687 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25688 return R;
25689 return SDValue();
25690 }
25691
25692 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25693 return SDValue();
25694
25695 // Next, search for a subtraction with a slightly different constant. By
25696 // adjusting the condition code, we can still eliminate the compare
25697 // instruction. Adjusting the constant is only valid if it does not result
25698 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25699 // Since such comparisons are trivially true/false, we should not encounter
25700 // them here but check for them nevertheless to be on the safe side.
25701 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25702 AArch64CC::CondCode NewCC) {
25703 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25704 CmpOpConst->getValueType(0));
25705 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25706 CmpOpConst->getValueType(0));
25707 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25708 };
25709 switch (CC) {
25710 case AArch64CC::EQ:
25711 case AArch64CC::LS:
25712 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25713 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25714 case AArch64CC::NE:
25715 case AArch64CC::HI:
25716 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25717 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25718 case AArch64CC::LO:
25719 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25720 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25721 case AArch64CC::HS:
25722 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25723 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25724 case AArch64CC::LT:
25725 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25726 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25727 case AArch64CC::LE:
25728 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25729 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25730 case AArch64CC::GT:
25731 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25732 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25733 case AArch64CC::GE:
25734 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25735 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25736 default:
25737 return SDValue();
25738 }
25739}
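
// Illustrative sketch (invented helper): the reassociation above is simply
//   (x + y) + (-c) == (x - c) + y
// which lets the true/false ADD reuse the SUBS result that the comparison
// already computes, instead of keeping a separate ADD of the negated constant.
static inline bool exampleCselReassociationHolds(uint64_t X, uint64_t Y,
                                                 uint64_t C) {
  uint64_t Original = (X + Y) - C; // ADD (ADD x y) -c
  uint64_t Reassoc = (X - C) + Y;  // ADD (SUBS x c) y
  return Original == Reassoc;      // identical, so the SUBS becomes a CSE
}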
25740
25741 static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25742 AArch64CC::CondCode OpCC =
25743 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25744
25745 if (OpCC != AArch64CC::NE)
25746 return SDValue();
25747
25748 SDValue PTest = Op->getOperand(3);
25749 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25750 return SDValue();
25751
25752 SDValue TruePred = PTest.getOperand(0);
25753 SDValue AnyPred = PTest.getOperand(1);
25754
25755 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25756 TruePred = TruePred.getOperand(0);
25757
25758 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25759 AnyPred = AnyPred.getOperand(0);
25760
25761 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25762 return SDValue();
25763
25764 SDValue LastB = Op->getOperand(0);
25765 SDValue Default = Op->getOperand(1);
25766
25767 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25768 return SDValue();
25769
25770 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25771 AnyPred, Default, LastB.getOperand(1));
25772}
25773
25774// Optimize CSEL instructions
25775 static SDValue performCSELCombine(SDNode *N,
25776 TargetLowering::DAGCombinerInfo &DCI,
25777 SelectionDAG &DAG) {
25778 // CSEL x, x, cc -> x
25779 if (N->getOperand(0) == N->getOperand(1))
25780 return N->getOperand(0);
25781
25782 if (SDValue R = foldCSELOfCSEL(N, DAG))
25783 return R;
25784
25785 // Try to reassociate the true/false expressions so that we can do CSE with
25786 // a SUBS instruction used to perform the comparison.
25787 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25788 return R;
25789
25790 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25791 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
25792 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25793 return Folded;
25794
25795 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25796 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25797 SDValue Cond = N->getOperand(3);
25798 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25799 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25800 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25801 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25802 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25803 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25804 !isNullConstant(Cond.getOperand(1))) {
25805 AArch64CC::CondCode OldCond =
25806 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25807 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25808 if (NewCond != AArch64CC::AL) {
25809 SDLoc DL(N);
25810 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25811 Cond.getOperand(1), Cond.getOperand(0));
25812 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25813 N->getOperand(1), getCondCode(DAG, NewCond),
25814 Sub.getValue(1));
25815 }
25816 }
25817
25818 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
25819 // use overflow flags, to avoid the comparison with zero. In case of success,
25820 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
25821 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
25822 // nodes with their SUBS equivalent as is already done for other flag-setting
25823 // operators, in which case doing the replacement here becomes redundant.
25824 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
25825 isNullConstant(Cond.getOperand(1))) {
25826 SDValue Sub = Cond.getOperand(0);
25827 AArch64CC::CondCode CC =
25828 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25829 if (Sub.getOpcode() == ISD::SUB &&
25830 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
25831 CC == AArch64CC::PL)) {
25832 SDLoc DL(N);
25833 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25834 Sub.getOperand(0), Sub.getOperand(1));
25835 DCI.CombineTo(Sub.getNode(), Subs);
25836 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
25837 return SDValue(N, 0);
25838 }
25839 }
25840
25841 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25842 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
25843 return CondLast;
25844
25845 return performCONDCombine(N, DCI, DAG, 2, 3);
25846}
25847
25848 // Try to re-use an already extended operand of a vector SetCC feeding an
25849 // extended select. Doing so avoids requiring another full extension of the
25850 // SET_CC result when lowering the select.
25851 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25852 EVT Op0MVT = Op->getOperand(0).getValueType();
25853 if (!Op0MVT.isVector() || Op->use_empty())
25854 return SDValue();
25855
25856 // Make sure that all uses of Op are VSELECTs with matching result types where
25857 // the result type has a larger element type than the SetCC operand.
25858 SDNode *FirstUse = *Op->user_begin();
25859 if (FirstUse->getOpcode() != ISD::VSELECT)
25860 return SDValue();
25861 EVT UseMVT = FirstUse->getValueType(0);
25862 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25863 return SDValue();
25864 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
25865 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
25866 }))
25867 return SDValue();
25868
25869 APInt V;
25870 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
25871 return SDValue();
25872
25873 SDLoc DL(Op);
25874 SDValue Op0ExtV;
25875 SDValue Op1ExtV;
25876 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
25877 // Check if the first operand of the SET_CC is already extended. If it is,
25878 // split the SET_CC and re-use the extended version of the operand.
25879 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
25880 Op->getOperand(0));
25881 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
25882 Op->getOperand(0));
25883 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25884 Op0ExtV = SDValue(Op0SExt, 0);
25885 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
25886 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25887 Op0ExtV = SDValue(Op0ZExt, 0);
25888 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
25889 } else
25890 return SDValue();
25891
25892 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
25893 Op0ExtV, Op1ExtV, Op->getOperand(2));
25894}
25895
25896static SDValue
25897 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25898 SelectionDAG &DAG) {
25899 SDValue Vec = N->getOperand(0);
25900 if (DCI.isBeforeLegalize() &&
25901 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25902 Vec.getValueType().isFixedLengthVector() &&
25903 Vec.getValueType().isPow2VectorType()) {
25904 SDLoc DL(N);
25905 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
25906 DAG);
25907 }
25908
25909 return SDValue();
25910}
25911
25912 static SDValue performSETCCCombine(SDNode *N,
25913 TargetLowering::DAGCombinerInfo &DCI,
25914 SelectionDAG &DAG) {
25915 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25916 SDValue LHS = N->getOperand(0);
25917 SDValue RHS = N->getOperand(1);
25918 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
25919 SDLoc DL(N);
25920 EVT VT = N->getValueType(0);
25921
25922 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
25923 return V;
25924
25925 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25926 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
25927 LHS->getOpcode() == AArch64ISD::CSEL &&
25928 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
25929 LHS->hasOneUse()) {
25930 // Invert CSEL's condition.
25931 auto OldCond =
25932 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
25933 auto NewCond = getInvertedCondCode(OldCond);
25934
25935 // csel 0, 1, !cond, X
25936 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
25937 LHS.getOperand(0), LHS.getOperand(1),
25938 getCondCode(DAG, NewCond), LHS.getOperand(3));
25939 return DAG.getZExtOrTrunc(CSEL, DL, VT);
25940 }
25941
25942 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
25943 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
25944 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
25945 LHS->hasOneUse()) {
25946 EVT TstVT = LHS->getValueType(0);
25947 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
25948 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
25949 // this pattern will get better opt in emitComparison
25950 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25951 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25952 DAG.getSignedConstant(TstImm, DL, TstVT));
25953 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25954 }
25955 }
25956
25957 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25958 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25959 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25960 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
25961 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25962 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25963 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
25964 LHS->getOpcode() == ISD::BITCAST) {
25965 EVT ToVT = LHS->getValueType(0);
25966 EVT FromVT = LHS->getOperand(0).getValueType();
25967 if (FromVT.isFixedLengthVector() &&
25968 FromVT.getVectorElementType() == MVT::i1) {
25969 bool IsNull = isNullConstant(RHS);
25970 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25971 DL, MVT::i1, LHS->getOperand(0));
25972 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25973 LHS);
25974 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25975 }
25976 }
25977
25978 // Try to perform the memcmp when the result is tested for [in]equality with 0
25979 if (SDValue V = performOrXorChainCombine(N, DAG))
25980 return V;
25981
25982 EVT CmpVT = LHS.getValueType();
25983
25984 // NOTE: This exists as a combine only because it proved too awkward to match
25985 // splat(1) across all the NEON types during isel.
25986 APInt SplatLHSVal;
25987 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
25988 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
25989 SplatLHSVal.isOne())
25990 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
25991
25992 return SDValue();
25993}
25994
25995// Replace a flag-setting operator (eg ANDS) with the generic version
25996// (eg AND) if the flag is unused.
25997 static SDValue performFlagSettingCombine(SDNode *N,
25998 TargetLowering::DAGCombinerInfo &DCI,
25999 unsigned GenericOpcode) {
26000 SDLoc DL(N);
26001 SDValue LHS = N->getOperand(0);
26002 SDValue RHS = N->getOperand(1);
26003 EVT VT = N->getValueType(0);
26004
26005 // If the flag result isn't used, convert back to a generic opcode.
26006 if (!N->hasAnyUseOfValue(1)) {
26007 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26008 return DCI.CombineTo(N, Res, SDValue(N, 1));
26009 }
26010
26011 // Combine identical generic nodes into this node, re-using the result.
26012 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26013 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
26014 DCI.CombineTo(Generic, SDValue(N, 0));
26015
26016 return SDValue();
26017}
26018
26019 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26020 // setcc_merge_zero pred
26021 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26022 // => extract_subvector (inner setcc_merge_zero)
26023 SDValue Pred = N->getOperand(0);
26024 SDValue LHS = N->getOperand(1);
26025 SDValue RHS = N->getOperand(2);
26026 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26027
26028 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26029 LHS->getOpcode() != ISD::SIGN_EXTEND)
26030 return SDValue();
26031
26032 SDValue Extract = LHS->getOperand(0);
26033 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26034 Extract->getValueType(0) != N->getValueType(0) ||
26035 Extract->getConstantOperandVal(1) != 0)
26036 return SDValue();
26037
26038 SDValue InnerSetCC = Extract->getOperand(0);
26039 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26040 return SDValue();
26041
26042 // By this point we've effectively got
26043 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26044 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26045 // can operate on A directly.
26046 SDValue InnerPred = InnerSetCC.getOperand(0);
26047 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26048 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26049 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26050 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26051 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26052 return Extract;
26053
26054 return SDValue();
26055}
26056
26057static bool isSignExtInReg(const SDValue &V) {
26058 if (V.getOpcode() != AArch64ISD::VASHR ||
26059 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26060 return false;
26061
26062 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26063 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26064 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26065 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26066}
26067
26068static SDValue
26069 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26070 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26071 "Unexpected opcode!");
26072
26073 SelectionDAG &DAG = DCI.DAG;
26074 SDValue Pred = N->getOperand(0);
26075 SDValue LHS = N->getOperand(1);
26076 SDValue RHS = N->getOperand(2);
26077 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26078
26079 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26080 return V;
26081
26082 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26083 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26084 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26085 // setcc_merge_zero(
26086 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26087 // => setcc_merge_zero(pred, ...)
26088 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26089 LHS->getOperand(0)->getOperand(0) == Pred)
26090 return LHS->getOperand(0);
26091
26092 // setcc_merge_zero(
26093 // all_active, extend(nxvNi1 ...), != splat(0))
26094 // -> nxvNi1 ...
26095 if (isAllActivePredicate(DAG, Pred))
26096 return LHS->getOperand(0);
26097
26098 // setcc_merge_zero(
26099 // pred, extend(nxvNi1 ...), != splat(0))
26100 // -> nxvNi1 and(pred, ...)
26101 if (DCI.isAfterLegalizeDAG())
26102 // Do this after legalization to allow more folds on setcc_merge_zero
26103 // to be recognized.
26104 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26105 LHS->getOperand(0), Pred);
26106 }
26107
26108 // setcc_merge_zero(
26109 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26110 // => setcc_merge_zero(
26111 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26112 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26113 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26114 SDValue L0 = LHS->getOperand(0);
26115 SDValue L1 = LHS->getOperand(1);
26116 SDValue L2 = LHS->getOperand(2);
26117
26118 if (L0.getOpcode() == ISD::UNDEF && isNullConstant(L2) &&
26119 isSignExtInReg(L1)) {
26120 SDLoc DL(N);
26121 SDValue Shl = L1.getOperand(0);
26122 SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26123 LHS.getValueType(), L0, Shl, L2);
26124 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26125 Pred, NewLHS, RHS, N->getOperand(3));
26126 }
26127 }
26128
26129 return SDValue();
26130}
26131
26132// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26133// as well as whether the test should be inverted. This code is required to
26134// catch these cases (as opposed to standard dag combines) because
26135// AArch64ISD::TBZ is matched during legalization.
26136static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26137 SelectionDAG &DAG) {
26138
26139 if (!Op->hasOneUse())
26140 return Op;
26141
26142 // We don't handle undef/constant-fold cases below, as they should have
26143 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26144 // etc.)
26145
26146 // (tbz (trunc x), b) -> (tbz x, b)
26147 // This case is just here to enable more of the below cases to be caught.
26148 if (Op->getOpcode() == ISD::TRUNCATE &&
26149 Bit < Op->getValueType(0).getSizeInBits()) {
26150 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26151 }
26152
26153 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26154 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26155 Bit < Op->getOperand(0).getValueSizeInBits()) {
26156 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26157 }
26158
26159 if (Op->getNumOperands() != 2)
26160 return Op;
26161
26162 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26163 if (!C)
26164 return Op;
26165
26166 switch (Op->getOpcode()) {
26167 default:
26168 return Op;
26169
26170 // (tbz (and x, m), b) -> (tbz x, b)
26171 case ISD::AND:
26172 if ((C->getZExtValue() >> Bit) & 1)
26173 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26174 return Op;
26175
26176 // (tbz (shl x, c), b) -> (tbz x, b-c)
26177 case ISD::SHL:
26178 if (C->getZExtValue() <= Bit &&
26179 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26180 Bit = Bit - C->getZExtValue();
26181 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26182 }
26183 return Op;
26184
26185 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26186 case ISD::SRA:
26187 Bit = Bit + C->getZExtValue();
26188 if (Bit >= Op->getValueType(0).getSizeInBits())
26189 Bit = Op->getValueType(0).getSizeInBits() - 1;
26190 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26191
26192 // (tbz (srl x, c), b) -> (tbz x, b+c)
26193 case ISD::SRL:
26194 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26195 Bit = Bit + C->getZExtValue();
26196 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26197 }
26198 return Op;
26199
26200 // (tbz (xor x, -1), b) -> (tbnz x, b)
26201 case ISD::XOR:
26202 if ((C->getZExtValue() >> Bit) & 1)
26203 Invert = !Invert;
26204 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26205 }
26206}
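
// Illustrative sketch (invented helper): the shift cases above only retarget
// the tested bit. For example, bit B of (X << C) is bit B - C of X whenever
// C <= B < 64, so TBZ can look through the shift.
static inline bool exampleTestBitThroughShl(uint64_t X, unsigned C, unsigned B) {
  if (C > B || B >= 64)
    return true;                       // outside the range the combine handles
  bool ViaShift = ((X << C) >> B) & 1; // test bit B of (shl x, c)
  bool Direct = (X >> (B - C)) & 1;    // test bit B - C of x
  return ViaShift == Direct;           // always equal in this range
}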
26207
26208// Optimize test single bit zero/non-zero and branch.
26209 static SDValue performTBZCombine(SDNode *N,
26210 TargetLowering::DAGCombinerInfo &DCI,
26211 SelectionDAG &DAG) {
26212 unsigned Bit = N->getConstantOperandVal(2);
26213 bool Invert = false;
26214 SDValue TestSrc = N->getOperand(1);
26215 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26216
26217 if (TestSrc == NewTestSrc)
26218 return SDValue();
26219
26220 unsigned NewOpc = N->getOpcode();
26221 if (Invert) {
26222 if (NewOpc == AArch64ISD::TBZ)
26223 NewOpc = AArch64ISD::TBNZ;
26224 else {
26225 assert(NewOpc == AArch64ISD::TBNZ);
26226 NewOpc = AArch64ISD::TBZ;
26227 }
26228 }
26229
26230 SDLoc DL(N);
26231 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26232 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26233}
26234
26235// Swap vselect operands where it may allow a predicated operation to achieve
26236// the `sel`.
26237//
26238// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26239// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
26240 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26241 auto SelectA = N->getOperand(1);
26242 auto SelectB = N->getOperand(2);
26243 auto NTy = N->getValueType(0);
26244
26245 if (!NTy.isScalableVector())
26246 return SDValue();
26247 SDValue SetCC = N->getOperand(0);
26248 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26249 return SDValue();
26250
26251 switch (SelectB.getOpcode()) {
26252 default:
26253 return SDValue();
26254 case ISD::FMUL:
26255 case ISD::FSUB:
26256 case ISD::FADD:
26257 break;
26258 }
26259 if (SelectA != SelectB.getOperand(0))
26260 return SDValue();
26261
26262 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26263 ISD::CondCode InverseCC =
26264 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26265 auto InverseSetCC =
26266 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26267 SetCC.getOperand(1), InverseCC);
26268
26269 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26270 {InverseSetCC, SelectB, SelectA});
26271}
26272
26273// vselect (v1i1 setcc) ->
26274// vselect (v1iXX setcc) (XX is the size of the compared operand type)
26275// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
26276// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
26277// such VSELECT.
26278static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
26279 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
26280 return SwapResult;
26281
26282 SDValue N0 = N->getOperand(0);
26283 SDValue IfTrue = N->getOperand(1);
26284 SDValue IfFalse = N->getOperand(2);
26285 EVT ResVT = N->getValueType(0);
26286 EVT CCVT = N0.getValueType();
26287
26288 if (isAllActivePredicate(DAG, N0))
26289 return N->getOperand(1);
26290
26291 if (isAllInactivePredicate(N0))
26292 return N->getOperand(2);
26293
26294 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
26295 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
26296 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
26297 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
26298 // -> merge_passthru_op A, B,{Bn,} C
26299 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
26300 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
26301 IfTrue->getOperand(0) == N0) {
26302 SmallVector<SDValue> Ops(IfTrue->ops());
26303 Ops[0] = N0;
26304 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
26305
26306 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
26307 }
26308 }
26309
26310 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
26311 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
26312 // supported types.
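 // For example, with v4i32 lanes: (vselect (setgt x, -1), 1, -1) becomes
 // (or (sra x, 31), 1), since the arithmetic shift produces 0 for non-negative
 // lanes and -1 for negative lanes, and or-ing with 1 yields 1 or -1.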
26313 SDValue SetCC = N->getOperand(0);
26314 if (SetCC.getOpcode() == ISD::SETCC &&
26315 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26316 SDValue CmpLHS = SetCC.getOperand(0);
26317 EVT VT = CmpLHS.getValueType();
26318 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26319 SDNode *SplatLHS = N->getOperand(1).getNode();
26320 SDNode *SplatRHS = N->getOperand(2).getNode();
26321 APInt SplatLHSVal;
26322 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26323 VT.isSimple() &&
26324 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26325 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26326 VT.getSimpleVT().SimpleTy) &&
26327 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26328 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26329 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26330 unsigned NumElts = VT.getVectorNumElements();
26331 SmallVector<SDValue> Ops(
26332 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26333 VT.getScalarType()));
26334 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26335
26336 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26337 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26338 return Or;
26339 }
26340 }
26341
26342 EVT CmpVT = N0.getOperand(0).getValueType();
26343 if (N0.getOpcode() != ISD::SETCC ||
26344 CCVT.getVectorElementCount() != ResVT.getVectorElementCount() ||
26345 CCVT.getVectorElementType() != MVT::i1 ||
26346 CmpVT.getVectorElementType().isFloatingPoint())
26347 return SDValue();
26348
26349 // Only combine when the result type is of the same size as the compared
26350 // operands.
26351 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26352 return SDValue();
26353
26354 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26355 N0.getOperand(0), N0.getOperand(1),
26356 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26357 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26358 IfTrue, IfFalse);
26359}
26360
26361/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26362/// the compare-mask instructions rather than going via NZCV, even if LHS and
26363/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26364/// with a vector one followed by a DUP shuffle on the result.
26365static SDValue performSelectCombine(SDNode *N,
26366 TargetLowering::DAGCombinerInfo &DCI) {
26367 SelectionDAG &DAG = DCI.DAG;
26368 SDValue N0 = N->getOperand(0);
26369 EVT ResVT = N->getValueType(0);
26370
26371 if (N0.getOpcode() != ISD::SETCC)
26372 return SDValue();
26373
26374 if (ResVT.isScalableVT())
26375 return SDValue();
26376
26377 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26378 // scalar SetCCResultType. We also don't expect vectors, because we assume
26379 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26380 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26381 "Scalar-SETCC feeding SELECT has unexpected result type!");
26382
26383 // If NumMaskElts == 0, the comparison is larger than the select result. The
26384 // largest real NEON comparison is 64-bits per lane, which means the result is
26385 // at most 32-bits and an illegal vector. Just bail out for now.
26386 EVT SrcVT = N0.getOperand(0).getValueType();
26387
26388 // Don't try to do this optimization when the setcc itself has i1 operands.
26389 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26390 // ruled out to prevent the creation of setccs that would need to be scalarized.
26391 if (SrcVT == MVT::i1 ||
26392 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26393 return SDValue();
26394
26395 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26396 if (!ResVT.isVector() || NumMaskElts == 0)
26397 return SDValue();
26398
26399 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26400 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26401
26402 // Also bail out if the vector CCVT isn't the same size as ResVT.
26403 // This can happen if the SETCC operand size doesn't divide the ResVT size
26404 // (e.g., f64 vs v3f32).
26405 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26406 return SDValue();
26407
26408 // Make sure we didn't create illegal types, if we're not supposed to.
26409 assert(DCI.isBeforeLegalize() ||
26410 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26411
26412 // First perform a vector comparison, where lane 0 is the one we're interested
26413 // in.
26414 SDLoc DL(N0);
26415 SDValue LHS =
26416 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26417 SDValue RHS =
26418 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26419 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26420
26421 // Now duplicate the comparison mask we want across all other lanes.
26422 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26423 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26424 Mask = DAG.getNode(ISD::BITCAST, DL,
26425 ResVT.changeVectorElementTypeToInteger(), Mask);
26426
26427 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26428}
26429
26430static SDValue performDUPCombine(SDNode *N,
26431 TargetLowering::DAGCombinerInfo &DCI) {
26432 EVT VT = N->getValueType(0);
26433 SDLoc DL(N);
26434 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26435 // 128-bit vector version.
26436 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26437 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26438 SmallVector<SDValue> Ops(N->ops());
26439 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26440 DCI.DAG.getVTList(LVT), Ops)) {
26441 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26442 DCI.DAG.getConstant(0, DL, MVT::i64));
26443 }
26444 }
26445
26446 if (N->getOpcode() == AArch64ISD::DUP) {
26447 // If the instruction is known to produce a scalar in SIMD registers, we can
26448 // duplicate it across the vector lanes using DUPLANE instead of moving it
26449 // to a GPR first. For example, this allows us to handle:
26450 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26451 SDValue Op = N->getOperand(0);
26452 // FIXME: Ideally, we should be able to handle all instructions that
26453 // produce a scalar value in FPRs.
26454 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26455 Op.getOpcode() == AArch64ISD::FCMGE ||
26456 Op.getOpcode() == AArch64ISD::FCMGT) {
26457 EVT ElemVT = VT.getVectorElementType();
26458 EVT ExpandedVT = VT;
26459 // Insert into a 128-bit vector to match DUPLANE's pattern.
26460 if (VT.getSizeInBits() != 128)
26461 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26462 128 / ElemVT.getSizeInBits());
26463 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26464 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26465 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26466 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26467 }
26468
26469 if (DCI.isAfterLegalizeDAG()) {
26470 // If scalar dup's operand is extract_vector_elt, try to combine them into
26471 // duplane. For example,
26472 //
26473 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26474 // t18: v4i32 = AArch64ISD::DUP t21
26475 // ==>
26476 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26477 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26478 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26479 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26480 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26481 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26482 EXTRACT_VEC_ELT.getOperand(1));
26483 }
26484 }
26485 }
26486
26487 return performPostLD1Combine(N, DCI, false);
26488 }
26489
26490 return SDValue();
26491}
26492
26493/// Get rid of unnecessary NVCASTs (that don't change the type).
26494static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26495 if (N->getValueType(0) == N->getOperand(0).getValueType())
26496 return N->getOperand(0);
26497 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26498 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26499 N->getOperand(0).getOperand(0));
26500
26501 return SDValue();
26502}
26503
26504// If all users of the globaladdr are of the form (globaladdr + constant), find
26505// the smallest constant, fold it into the globaladdr's offset and rewrite the
26506// globaladdr as (globaladdr + constant) - constant.
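// For example, if a global g is only used as (add g, 16) and (add g, 40), the
// smallest constant 16 is folded in to give g' = g + 16, the node is rewritten
// as (sub g', 16), and the two users can then simplify to g' + 0 and g' + 24.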
26507static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
26508 const AArch64Subtarget *Subtarget,
26509 const TargetMachine &TM) {
26510 auto *GN = cast<GlobalAddressSDNode>(N);
26511 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26512 AArch64II::MO_NO_FLAG)
26513 return SDValue();
26514
26515 uint64_t MinOffset = -1ull;
26516 for (SDNode *N : GN->users()) {
26517 if (N->getOpcode() != ISD::ADD)
26518 return SDValue();
26519 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26520 if (!C)
26521 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26522 if (!C)
26523 return SDValue();
26524 MinOffset = std::min(MinOffset, C->getZExtValue());
26525 }
26526 uint64_t Offset = MinOffset + GN->getOffset();
26527
26528 // Require that the new offset is larger than the existing one. Otherwise, we
26529 // can end up oscillating between two possible DAGs, for example,
26530 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26531 if (Offset <= uint64_t(GN->getOffset()))
26532 return SDValue();
26533
26534 // Check whether folding this offset is legal. It must not go out of bounds of
26535 // the referenced object to avoid violating the code model, and must be
26536 // smaller than 2^20 because this is the largest offset expressible in all
26537 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26538 // stores an immediate signed 21-bit offset.)
26539 //
26540 // This check also prevents us from folding negative offsets, which will end
26541 // up being treated in the same way as large positive ones. They could also
26542 // cause code model violations, and aren't really common enough to matter.
26543 if (Offset >= (1 << 20))
26544 return SDValue();
26545
26546 const GlobalValue *GV = GN->getGlobal();
26547 Type *T = GV->getValueType();
26548 if (!T->isSized() ||
26549 Offset > GV->getDataLayout().getTypeAllocSize(T))
26550 return SDValue();
26551
26552 SDLoc DL(GN);
26553 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26554 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26555 DAG.getConstant(MinOffset, DL, MVT::i64));
26556}
26557
26558static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26559 const AArch64Subtarget *Subtarget) {
26560 SDValue BR = N->getOperand(0);
26561 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26562 !BR.getValueType().isScalarInteger())
26563 return SDValue();
26564
26565 SDLoc DL(N);
26566 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26567}
26568
26569// Turns the vector of indices into a vector of byte offsets by scaling Offset
26570// by (BitWidth / 8).
26571static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
26572 SDLoc DL, unsigned BitWidth) {
26573 assert(Offset.getValueType().isScalableVector() &&
26574 "This method is only for scalable vectors of offsets");
26575
26576 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26577 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26578
26579 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26580}
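// For example, with 32-bit elements (BitWidth == 32) each index is shifted
// left by Log2(32 / 8) == 2, so an index of 7 becomes a byte offset of 28.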
26581
26582/// Check if the value of \p OffsetInBytes can be used as an immediate for
26583/// the gather load/prefetch and scatter store instructions with vector base and
26584/// immediate offset addressing mode:
26585///
26586/// [<Zn>.[S|D]{, #<imm>}]
26587///
26588/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
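/// For example, for 4-byte elements the valid immediates are 0, 4, 8, ..., 124.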
26589inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26590 unsigned ScalarSizeInBytes) {
26591 // The immediate is not a multiple of the scalar size.
26592 if (OffsetInBytes % ScalarSizeInBytes)
26593 return false;
26594
26595 // The immediate is out of range.
26596 if (OffsetInBytes / ScalarSizeInBytes > 31)
26597 return false;
26598
26599 return true;
26600}
26601
26602/// Check if the value of \p Offset represents a valid immediate for the SVE
26603/// gather load/prefetch and scatter store instructions with vector base and
26604/// immediate offset addressing mode:
26605///
26606/// [<Zn>.[S|D]{, #<imm>}]
26607///
26608/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26609static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26610 unsigned ScalarSizeInBytes) {
26611 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26612 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26613 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26614}
26615
26616static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26617 unsigned Opcode,
26618 bool OnlyPackedOffsets = true) {
26619 const SDValue Src = N->getOperand(2);
26620 const EVT SrcVT = Src->getValueType(0);
26621 assert(SrcVT.isScalableVector() &&
26622 "Scatter stores are only possible for SVE vectors");
26623
26624 SDLoc DL(N);
26625 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26626
26627 // Make sure that source data will fit into an SVE register
26628 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26629 return SDValue();
26630
26631 // For FPs, ACLE only supports _packed_ single and double precision types.
26632 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26633 if (SrcElVT.isFloatingPoint())
26634 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26635 ((Opcode != AArch64ISD::SST1Q_PRED &&
26636 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26637 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26638 return SDValue();
26639
26640 // Depending on the addressing mode, this is either a pointer or a vector of
26641 // pointers (that fits into one register)
26642 SDValue Base = N->getOperand(4);
26643 // Depending on the addressing mode, this is either a single offset or a
26644 // vector of offsets (that fits into one register)
26645 SDValue Offset = N->getOperand(5);
26646
26647 // For "scalar + vector of indices", just scale the indices. This applies to
26648 // non-temporal and quadword scatters, which do not have an addressing mode
26649 // with scaled offsets.
26650 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26651 Offset =
26652 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26653 Opcode = AArch64ISD::SSTNT1_PRED;
26654 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26655 Offset =
26656 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26657 Opcode = AArch64ISD::SST1Q_PRED;
26658 }
26659
26660 // In the case of non-temporal scatter stores there's only one SVE instruction
26661 // per data-size: "scalar + vector", i.e.
26662 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
26663 // Since we do have intrinsics that allow the arguments to be in a different
26664 // order, we may need to swap them to match the spec.
26665 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26666 Offset.getValueType().isVector())
26667 std::swap(Base, Offset);
26668
26669 // SST1_IMM requires that the offset is an immediate that is:
26670 // * a multiple of #SizeInBytes,
26671 // * in the range [0, 31 x #SizeInBytes],
26672 // where #SizeInBytes is the size in bytes of the stored items. For
26673 // immediates outside that range and non-immediate scalar offsets use SST1 or
26674 // SST1_UXTW instead.
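 // For example, when scattering 8-byte elements the valid immediate offsets
 // are 0, 8, 16, ..., 248; anything else falls back to the register forms.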
26675 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26676 if (!isValidImmForSVEVecImmAddrMode(Offset,
26677 SrcVT.getScalarSizeInBits() / 8)) {
26678 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26679 Opcode = AArch64ISD::SST1_UXTW_PRED;
26680 else
26681 Opcode = AArch64ISD::SST1_PRED;
26682
26683 std::swap(Base, Offset);
26684 }
26685 }
26686
26687 auto &TLI = DAG.getTargetLoweringInfo();
26688 if (!TLI.isTypeLegal(Base.getValueType()))
26689 return SDValue();
26690
26691 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26692 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
26693 // nxv2i64. Legalize accordingly.
26694 if (!OnlyPackedOffsets &&
26695 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26696 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26697
26698 if (!TLI.isTypeLegal(Offset.getValueType()))
26699 return SDValue();
26700
26701 // Source value type that is representable in hardware
26702 EVT HwSrcVt = getSVEContainerType(SrcVT);
26703
26704 // Keep the original type of the input data to store - this is needed to be
26705 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26706 // FP values we want the integer equivalent, so just use HwSrcVt.
26707 SDValue InputVT = DAG.getValueType(SrcVT);
26708 if (SrcVT.isFloatingPoint())
26709 InputVT = DAG.getValueType(HwSrcVt);
26710
26711 SDVTList VTs = DAG.getVTList(MVT::Other);
26712 SDValue SrcNew;
26713
26714 if (Src.getValueType().isFloatingPoint())
26715 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26716 else
26717 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26718
26719 SDValue Ops[] = {N->getOperand(0), // Chain
26720 SrcNew,
26721 N->getOperand(3), // Pg
26722 Base,
26723 Offset,
26724 InputVT};
26725
26726 return DAG.getNode(Opcode, DL, VTs, Ops);
26727}
26728
26729static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
26730 unsigned Opcode,
26731 bool OnlyPackedOffsets = true) {
26732 const EVT RetVT = N->getValueType(0);
26733 assert(RetVT.isScalableVector() &&
26734 "Gather loads are only possible for SVE vectors");
26735
26736 SDLoc DL(N);
26737
26738 // Make sure that the loaded data will fit into an SVE register
26739 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26740 return SDValue();
26741
26742 // Depending on the addressing mode, this is either a pointer or a vector of
26743 // pointers (that fits into one register)
26744 SDValue Base = N->getOperand(3);
26745 // Depending on the addressing mode, this is either a single offset or a
26746 // vector of offsets (that fits into one register)
26747 SDValue Offset = N->getOperand(4);
26748
26749 // For "scalar + vector of indices", scale the indices to obtain unscaled
26750 // offsets. This applies to non-temporal and quadword gathers, which do not
26751 // have an addressing mode with scaled offset.
26752 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26753 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26754 RetVT.getScalarSizeInBits());
26755 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26756 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26757 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26758 RetVT.getScalarSizeInBits());
26759 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26760 }
26761
26762 // In the case of non-temporal gather loads and quadword gather loads there's
26763 // only one addressing mode: "vector + scalar", e.g.
26764 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26765 // Since we do have intrinsics that allow the arguments to be in a different
26766 // order, we may need to swap them to match the spec.
26767 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26768 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26769 Offset.getValueType().isVector())
26770 std::swap(Base, Offset);
26771
26772 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26773 // * a multiple of #SizeInBytes,
26774 // * in the range [0, 31 x #SizeInBytes],
26775 // where #SizeInBytes is the size in bytes of the loaded items. For
26776 // immediates outside that range and non-immediate scalar offsets use
26777 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26778 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26779 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26780 if (!isValidImmForSVEVecImmAddrMode(Offset,
26781 RetVT.getScalarSizeInBits() / 8)) {
26782 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26783 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26784 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26785 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26786 else
26787 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26788 ? AArch64ISD::GLD1_MERGE_ZERO
26789 : AArch64ISD::GLDFF1_MERGE_ZERO;
26790
26791 std::swap(Base, Offset);
26792 }
26793 }
26794
26795 auto &TLI = DAG.getTargetLoweringInfo();
26796 if (!TLI.isTypeLegal(Base.getValueType()))
26797 return SDValue();
26798
26799 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26800 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
26801 // nxv2i64. Legalize accordingly.
26802 if (!OnlyPackedOffsets &&
26803 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26804 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26805
26806 // Return value type that is representable in hardware
26807 EVT HwRetVt = getSVEContainerType(RetVT);
26808
26809 // Keep the original output value type around - this is needed to be able to
26810 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26811 // values we want the integer equivalent, so just use HwRetVT.
26812 SDValue OutVT = DAG.getValueType(RetVT);
26813 if (RetVT.isFloatingPoint())
26814 OutVT = DAG.getValueType(HwRetVt);
26815
26816 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
26817 SDValue Ops[] = {N->getOperand(0), // Chain
26818 N->getOperand(2), // Pg
26819 Base, Offset, OutVT};
26820
26821 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
26822 SDValue LoadChain = SDValue(Load.getNode(), 1);
26823
26824 if (RetVT.isInteger() && (RetVT != HwRetVt))
26825 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
26826
26827 // If the original return value was FP, bitcast accordingly. Doing it here
26828 // means that we can avoid adding TableGen patterns for FPs.
26829 if (RetVT.isFloatingPoint())
26830 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
26831
26832 return DAG.getMergeValues({Load, LoadChain}, DL);
26833}
26834
26835static SDValue
26836performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26837 SelectionDAG &DAG) {
26838 SDLoc DL(N);
26839 SDValue Src = N->getOperand(0);
26840 unsigned Opc = Src->getOpcode();
26841
26842 // Sign extend of an unsigned unpack -> signed unpack
26843 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26844
26845 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26846 : AArch64ISD::SUNPKLO;
26847
26848 // Push the sign extend to the operand of the unpack
26849 // This is necessary where, for example, the operand of the unpack
26850 // is another unpack:
26851 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26852 // ->
26853 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26854 // ->
26855 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26856 SDValue ExtOp = Src->getOperand(0);
26857 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
26858 EVT EltTy = VT.getVectorElementType();
26859 (void)EltTy;
26860
26861 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26862 "Sign extending from an invalid type");
26863
26864 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
26865
26866 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
26867 ExtOp, DAG.getValueType(ExtVT));
26868
26869 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
26870 }
26871
26872 // Sign extend of CSET -> CSETM.
26873 if (Opc == AArch64ISD::CSEL &&
26874 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
26875 EVT VT = N->getValueType(0);
26876 SDValue TVal = Src.getOperand(0);
26877 SDValue FVal = Src.getOperand(1);
26878
26879 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
26880 if (isNullConstant(TVal) && isOneConstant(FVal))
26881 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
26882 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
26883 Src.getOperand(3));
26884
26885 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
26886 if (isOneConstant(TVal) && isNullConstant(FVal))
26887 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
26888 DAG.getAllOnesConstant(DL, VT), FVal,
26889 Src.getOperand(2), Src.getOperand(3));
26890 }
26891
26892 if (DCI.isBeforeLegalizeOps())
26893 return SDValue();
26894
26895 if (!EnableCombineMGatherIntrinsics)
26896 return SDValue();
26897
26898 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26899 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26900 unsigned NewOpc;
26901 unsigned MemVTOpNum = 4;
26902 switch (Opc) {
26903 case AArch64ISD::LD1_MERGE_ZERO:
26904 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26905 MemVTOpNum = 3;
26906 break;
26907 case AArch64ISD::LDNF1_MERGE_ZERO:
26908 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26909 MemVTOpNum = 3;
26910 break;
26911 case AArch64ISD::LDFF1_MERGE_ZERO:
26912 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26913 MemVTOpNum = 3;
26914 break;
26915 case AArch64ISD::GLD1_MERGE_ZERO:
26916 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26917 break;
26918 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26919 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
26920 break;
26921 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26922 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
26923 break;
26924 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26925 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
26926 break;
26927 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26928 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
26929 break;
26930 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26931 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
26932 break;
26933 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26934 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
26935 break;
26936 case AArch64ISD::GLDFF1_MERGE_ZERO:
26937 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
26938 break;
26939 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
26940 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
26941 break;
26942 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
26943 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
26944 break;
26945 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
26946 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
26947 break;
26948 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
26949 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
26950 break;
26951 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
26952 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
26953 break;
26954 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
26955 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
26956 break;
26957 case AArch64ISD::GLDNT1_MERGE_ZERO:
26958 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
26959 break;
26960 default:
26961 return SDValue();
26962 }
26963
26964 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
26965 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
26966
26967 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
26968 return SDValue();
26969
26970 EVT DstVT = N->getValueType(0);
26971 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
26972
26973 SmallVector<SDValue, 5> Ops;
26974 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
26975 Ops.push_back(Src->getOperand(I));
26976
26977 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
26978 DCI.CombineTo(N, ExtLoad);
26979 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
26980
26981 // Return N so it doesn't get rechecked
26982 return SDValue(N, 0);
26983}
26984
26985/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26986/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26987/// != nxv2i32) do not need legalization.
26988static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26989 const unsigned OffsetPos = 4;
26990 SDValue Offset = N->getOperand(OffsetPos);
26991
26992 // Not an unpacked vector, bail out.
26993 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26994 return SDValue();
26995
26996 // Extend the unpacked offset vector to 64-bit lanes.
26997 SDLoc DL(N);
26998 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
26999 SmallVector<SDValue, 5> Ops(N->ops());
27000 // Replace the offset operand with the 64-bit one.
27001 Ops[OffsetPos] = Offset;
27002
27003 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27004}
27005
27006/// Combines a node carrying the intrinsic
27007/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27008/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27009/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27010/// sve gather prefetch instruction with vector plus immediate addressing mode.
27011static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
27012 unsigned ScalarSizeInBytes) {
27013 const unsigned ImmPos = 4, OffsetPos = 3;
27014 // No need to combine the node if the immediate is valid...
27015 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27016 return SDValue();
27017
27018 // ...otherwise swap the offset base with the offset...
27019 SmallVector<SDValue, 5> Ops(N->ops());
27020 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27021 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27022 // `aarch64_sve_prfb_gather_uxtw_index`.
27023 SDLoc DL(N);
27024 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
27025 MVT::i64);
27026
27027 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27028}
27029
27030// Return true if the vector operation can guarantee only the first lane of its
27031// result contains data, with all bits in other lanes set to zero.
27032static bool isLanes1toNKnownZero(SDValue Op) {
27033 switch (Op.getOpcode()) {
27034 default:
27035 return false;
27036 case AArch64ISD::ANDV_PRED:
27037 case AArch64ISD::EORV_PRED:
27038 case AArch64ISD::FADDA_PRED:
27039 case AArch64ISD::FADDV_PRED:
27040 case AArch64ISD::FMAXNMV_PRED:
27041 case AArch64ISD::FMAXV_PRED:
27042 case AArch64ISD::FMINNMV_PRED:
27043 case AArch64ISD::FMINV_PRED:
27044 case AArch64ISD::ORV_PRED:
27045 case AArch64ISD::SADDV_PRED:
27046 case AArch64ISD::SMAXV_PRED:
27047 case AArch64ISD::SMINV_PRED:
27048 case AArch64ISD::UADDV_PRED:
27049 case AArch64ISD::UMAXV_PRED:
27050 case AArch64ISD::UMINV_PRED:
27051 return true;
27052 }
27053}
27054
27055static SDValue removeRedundantInsertVectorElt(SDNode *N) {
27056 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27057 SDValue InsertVec = N->getOperand(0);
27058 SDValue InsertElt = N->getOperand(1);
27059 SDValue InsertIdx = N->getOperand(2);
27060
27061 // We only care about inserts into the first element...
27062 if (!isNullConstant(InsertIdx))
27063 return SDValue();
27064 // ...of a zero'd vector...
27066 return SDValue();
27067 // ...where the inserted data was previously extracted...
27068 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27069 return SDValue();
27070
27071 SDValue ExtractVec = InsertElt.getOperand(0);
27072 SDValue ExtractIdx = InsertElt.getOperand(1);
27073
27074 // ...from the first element of a vector.
27075 if (!isNullConstant(ExtractIdx))
27076 return SDValue();
27077
27078 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27079
27080 // Ensure there's no type conversion going on.
27081 if (N->getValueType(0) != ExtractVec.getValueType())
27082 return SDValue();
27083
27084 if (!isLanes1toNKnownZero(ExtractVec))
27085 return SDValue();
27086
27087 // The explicit zeroing is redundant.
27088 return ExtractVec;
27089}
27090
27091static SDValue
27094 return Res;
27095
27096 return performPostLD1Combine(N, DCI, true);
27097}
27098
27099static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27100 TargetLowering::DAGCombinerInfo &DCI,
27101 const AArch64Subtarget *Subtarget) {
27102 SDValue N0 = N->getOperand(0);
27103 EVT VT = N->getValueType(0);
27104
27105 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
27106 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
27107 return SDValue();
27108
27109 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
27110 EVT EltVT = VT.getVectorElementType();
27111 return EltVT == MVT::f32 || EltVT == MVT::f64;
27112 };
27113
27114 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
27115 // We purposefully don't care about legality of the nodes here as we know
27116 // they can be split down into something legal.
27117 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
27118 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
27119 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
27120 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
27121 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
27122 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
27123 LN0->getChain(), LN0->getBasePtr(),
27124 N0.getValueType(), LN0->getMemOperand());
27125 DCI.CombineTo(N, ExtLoad);
27126 DCI.CombineTo(
27127 N0.getNode(),
27128 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
27129 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
27130 ExtLoad.getValue(1));
27131 return SDValue(N, 0); // Return N so it doesn't get rechecked!
27132 }
27133
27134 return SDValue();
27135}
27136
27138 const AArch64Subtarget *Subtarget) {
27139 EVT VT = N->getValueType(0);
27140
27141 // Don't expand for NEON, SVE2 or SME
27142 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
27143 return SDValue();
27144
27145 SDLoc DL(N);
27146
27147 SDValue Mask = N->getOperand(0);
27148 SDValue In1 = N->getOperand(1);
27149 SDValue In2 = N->getOperand(2);
27150
27151 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
27152 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
27153 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
27154 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
27155}
27156
27158 EVT VT = N->getValueType(0);
27159
27160 SDValue Insert = N->getOperand(0);
27161 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
27162 return SDValue();
27163
27164 if (!Insert.getOperand(0).isUndef())
27165 return SDValue();
27166
27167 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
27168 uint64_t IdxDupLane = N->getConstantOperandVal(1);
27169 if (IdxInsert != 0 || IdxDupLane != 0)
27170 return SDValue();
27171
27172 SDValue Bitcast = Insert.getOperand(1);
27173 if (Bitcast.getOpcode() != ISD::BITCAST)
27174 return SDValue();
27175
27176 SDValue Subvec = Bitcast.getOperand(0);
27177 EVT SubvecVT = Subvec.getValueType();
27178 if (!SubvecVT.is128BitVector())
27179 return SDValue();
27180 EVT NewSubvecVT =
27182
27183 SDLoc DL(N);
27184 SDValue NewInsert =
27185 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27186 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27187 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27188 NewInsert, N->getOperand(1));
27189 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27190}
27191
27192// Try to combine mull with uzp1.
27195 SelectionDAG &DAG) {
27196 if (DCI.isBeforeLegalizeOps())
27197 return SDValue();
27198
27199 SDValue LHS = N->getOperand(0);
27200 SDValue RHS = N->getOperand(1);
27201
27202 SDValue ExtractHigh;
27203 SDValue ExtractLow;
27204 SDValue TruncHigh;
27205 SDValue TruncLow;
27206 SDLoc DL(N);
27207
27208 // Check the operands are trunc and extract_high.
27210 RHS.getOpcode() == ISD::TRUNCATE) {
27211 TruncHigh = RHS;
27212 if (LHS.getOpcode() == ISD::BITCAST)
27213 ExtractHigh = LHS.getOperand(0);
27214 else
27215 ExtractHigh = LHS;
27217 LHS.getOpcode() == ISD::TRUNCATE) {
27218 TruncHigh = LHS;
27219 if (RHS.getOpcode() == ISD::BITCAST)
27220 ExtractHigh = RHS.getOperand(0);
27221 else
27222 ExtractHigh = RHS;
27223 } else
27224 return SDValue();
27225
27226 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27227 // with uzp1.
27228 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll
27229 SDValue TruncHighOp = TruncHigh.getOperand(0);
27230 EVT TruncHighOpVT = TruncHighOp.getValueType();
27231 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
27232 DAG.isSplatValue(TruncHighOp, false))
27233 return SDValue();
27234
27235 // Check there is other extract_high with same source vector.
27236 // For example,
27237 //
27238 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
27239 // t12: v4i16 = truncate t11
27240 // t31: v4i32 = AArch64ISD::SMULL t18, t12
27241 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
27242 // t16: v4i16 = truncate t15
27243 // t30: v4i32 = AArch64ISD::SMULL t23, t16
27244 //
27245 // This dagcombine assumes the two extract_high nodes use the same source
27246 // vector in order to detect the pair of the mull. If they use different
27247 // source vectors, this code will not work.
27248 // TODO: Should also try to look through a bitcast.
27249 bool HasFoundMULLow = true;
27250 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
27251 if (ExtractHighSrcVec->use_size() != 2)
27252 HasFoundMULLow = false;
27253
27254 // Find ExtractLow.
27255 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
27256 if (User == ExtractHigh.getNode())
27257 continue;
27258
27259 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27261 HasFoundMULLow = false;
27262 break;
27263 }
27264
27265 ExtractLow.setNode(User);
27266 }
27267
27268 if (!ExtractLow || !ExtractLow->hasOneUse())
27269 HasFoundMULLow = false;
27270
27271 // Check ExtractLow's user.
27272 if (HasFoundMULLow) {
27273 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
27274 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
27275 HasFoundMULLow = false;
27276 } else {
27277 if (ExtractLowUser->getOperand(0) == ExtractLow) {
27278 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
27279 TruncLow = ExtractLowUser->getOperand(1);
27280 else
27281 HasFoundMULLow = false;
27282 } else {
27283 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
27284 TruncLow = ExtractLowUser->getOperand(0);
27285 else
27286 HasFoundMULLow = false;
27287 }
27288 }
27289 }
27290
27291 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27292 // with uzp1.
27293 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll
27294 EVT TruncHighVT = TruncHigh.getValueType();
27295 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27296 SDValue TruncLowOp =
27297 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
27298 EVT TruncLowOpVT = TruncLowOp.getValueType();
27299 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
27300 DAG.isSplatValue(TruncLowOp, false)))
27301 return SDValue();
27302
27303 // Create uzp1, extract_high and extract_low.
27304 if (TruncHighOpVT != UZP1VT)
27305 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
27306 if (TruncLowOpVT != UZP1VT)
27307 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
27308
27309 SDValue UZP1 =
27310 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
27311 SDValue HighIdxCst =
27312 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
27313 SDValue NewTruncHigh =
27314 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
27315 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
27316
27317 if (HasFoundMULLow) {
27318 EVT TruncLowVT = TruncLow.getValueType();
27319 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
27320 UZP1, ExtractLow.getOperand(1));
27321 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
27322 }
27323
27324 return SDValue(N, 0);
27325}
27326
27327static SDValue performMULLCombine(SDNode *N,
27328 TargetLowering::DAGCombinerInfo &DCI,
27329 SelectionDAG &DAG) {
27330 if (SDValue Val =
27331 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
27332 return Val;
27333
27334 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27335 return Val;
27336
27337 return SDValue();
27338}
27339
27340static SDValue
27342 SelectionDAG &DAG) {
27343 // Let's do the transform below.
27344 //
27345 // t34: v4i32 = AArch64ISD::UADDLV t2
27346 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27347 // t7: i64 = zero_extend t35
27348 // t20: v1i64 = scalar_to_vector t7
27349 // ==>
27350 // t34: v4i32 = AArch64ISD::UADDLV t2
27351 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27352 // t40: v1i64 = AArch64ISD::NVCAST t39
27353 if (DCI.isBeforeLegalizeOps())
27354 return SDValue();
27355
27356 EVT VT = N->getValueType(0);
27357 if (VT != MVT::v1i64)
27358 return SDValue();
27359
27360 SDValue ZEXT = N->getOperand(0);
27361 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27362 return SDValue();
27363
27364 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27365 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27366 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27367 return SDValue();
27368
27369 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27370 return SDValue();
27371
27372 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27373 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27374 UADDLV.getValueType() != MVT::v4i32 ||
27375 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27376 return SDValue();
27377
27378 // Let's generate new sequence with AArch64ISD::NVCAST.
27379 SDLoc DL(N);
27380 SDValue EXTRACT_SUBVEC =
27381 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27382 DAG.getConstant(0, DL, MVT::i64));
27383 SDValue NVCAST =
27384 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27385
27386 return NVCAST;
27387}
27388
27389static SDValue performVectorDeinterleaveCombine(
27390 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
27391 if (!DCI.isBeforeLegalize())
27392 return SDValue();
27393
27394 unsigned NumParts = N->getNumOperands();
27395 if (NumParts != 2 && NumParts != 4)
27396 return SDValue();
27397
27398 EVT SubVecTy = N->getValueType(0);
27399
27400 // At the moment we're unlikely to see a fixed-width vector deinterleave as
27401 // we usually generate shuffles instead.
27402 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
27403 if (!SubVecTy.isScalableVector() ||
27404 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
27405 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
27406 return SDValue();
27407
27408 // Make sure each input operand is the correct extract_subvector of the same
27409 // wider vector.
27410 SDValue Op0 = N->getOperand(0);
27411 for (unsigned I = 0; I < NumParts; I++) {
27412 SDValue OpI = N->getOperand(I);
27413 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27414 OpI->getOperand(0) != Op0->getOperand(0))
27415 return SDValue();
27416 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
27417 return SDValue();
27418 }
27419
27420 // Normal loads are currently already handled by the InterleavedAccessPass so
27421 // we don't expect to see them here. Bail out if the masked load has an
27422 // unexpected number of uses, since we want to avoid a situation where we have
27423 // both deinterleaving loads and normal loads in the same block. Also, discard
27424 // masked loads that are extending, indexed, have an unexpected offset or have
27425 // an unsupported passthru value until we find a valid use case.
27426 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
27427 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
27428 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
27429 !MaskedLoad->getOffset().isUndef() ||
27430 (!MaskedLoad->getPassThru()->isUndef() &&
27431 !isZerosVector(MaskedLoad->getPassThru().getNode())))
27432 return SDValue();
27433
27434 // Now prove that the mask is an interleave of identical masks.
27435 SDLoc DL(N);
27436 SDValue NarrowMask =
27437 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
27438 if (!NarrowMask)
27439 return SDValue();
27440
27441 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
27442 : Intrinsic::aarch64_sve_ld4_sret;
27443 SDValue NewLdOps[] = {MaskedLoad->getChain(),
27444 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
27445 MaskedLoad->getBasePtr()};
27446 SDValue Res;
27447 if (NumParts == 2)
27448 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27449 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
27450 else
27451 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27452 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
27453 NewLdOps);
27454
27455 // We can now generate a structured load!
27456 SmallVector<SDValue, 4> ResOps(NumParts);
27457 for (unsigned Idx = 0; Idx < NumParts; Idx++)
27458 ResOps[Idx] = SDValue(Res.getNode(), Idx);
27459
27460 // Replace uses of the original chain result with the new chain result.
27461 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
27462 SDValue(Res.getNode(), NumParts));
27463 return DCI.CombineTo(N, ResOps, false);
27464}
27465
27466/// If the operand is a bitwise AND with a constant RHS that has no other uses,
27467/// and the shift amount is also constant, we can pull the AND out of the shift, i.e.
27468///
27469/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27470///
27471/// We prefer this canonical form to match existing isel patterns.
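/// For example, (shl (and x, 0xff), 8) becomes (and (shl x, 8), 0xff00).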
27472static SDValue performSHLCombine(SDNode *N,
27473 TargetLowering::DAGCombinerInfo &DCI,
27474 SelectionDAG &DAG) {
27475 if (DCI.isBeforeLegalizeOps())
27476 return SDValue();
27477
27478 SDValue Op0 = N->getOperand(0);
27479 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27480 return SDValue();
27481
27482 SDValue C1 = Op0->getOperand(1);
27483 SDValue C2 = N->getOperand(1);
27484 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27485 return SDValue();
27486
27487 // Might be folded into shifted op, do not lower.
27488 if (N->hasOneUse()) {
27489 unsigned UseOpc = N->user_begin()->getOpcode();
27490 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27491 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27492 return SDValue();
27493 }
27494
27495 SDLoc DL(N);
27496 EVT VT = N->getValueType(0);
27497
27498 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27499 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27500 // causing an infinite loop. The result may also be worse.
27501 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27502 if (!isa<ConstantSDNode>(NewRHS))
27503 return SDValue();
27504
27505 SDValue X = Op0->getOperand(0);
27506 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27507 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27508}
27509
27510static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
27511 unsigned IntrinsicID = N->getConstantOperandVal(1);
27512 auto Register =
27513 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27514 : AArch64SysReg::RNDRRS);
27515 SDLoc DL(N);
27516 SDValue A = DAG.getNode(
27517 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27518 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27519 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27520 DAG.getConstant(0, DL, MVT::i32),
27521 DAG.getConstant(0, DL, MVT::i32),
27522 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27523 return DAG.getMergeValues(
27524 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27525}
27526
27527SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
27528 DAGCombinerInfo &DCI) const {
27529 SelectionDAG &DAG = DCI.DAG;
27530 switch (N->getOpcode()) {
27531 default:
27532 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27533 break;
27534 case ISD::VECTOR_DEINTERLEAVE:
27535 return performVectorDeinterleaveCombine(N, DCI, DAG);
27536 case ISD::VECREDUCE_AND:
27537 case ISD::VECREDUCE_OR:
27538 case ISD::VECREDUCE_XOR:
27539 return performVecReduceBitwiseCombine(N, DCI, DAG);
27540 case ISD::ADD:
27541 case ISD::SUB:
27542 return performAddSubCombine(N, DCI);
27543 case ISD::BUILD_VECTOR:
27544 return performBuildVectorCombine(N, DCI, DAG);
27545 case ISD::SMIN:
27546 return performSMINCombine(N, DAG);
27547 case ISD::TRUNCATE:
27548 return performTruncateCombine(N, DAG, DCI);
27549 case AArch64ISD::ANDS:
27550 return performFlagSettingCombine(N, DCI, ISD::AND);
27551 case AArch64ISD::ADC:
27552 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27553 return R;
27554 return foldADCToCINC(N, DAG);
27555 case AArch64ISD::SBC:
27556 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27557 case AArch64ISD::ADCS:
27558 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27559 return R;
27560 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27561 case AArch64ISD::SBCS:
27562 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27563 return R;
27564 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27565 case AArch64ISD::ADDS:
27566 return performFlagSettingCombine(N, DCI, ISD::ADD);
27567 case AArch64ISD::SUBS:
27568 return performFlagSettingCombine(N, DCI, ISD::SUB);
27569 case AArch64ISD::BICi: {
27570 APInt DemandedBits =
27571 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27572 APInt DemandedElts =
27573 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27574
27575 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
27576 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27577 return SDValue();
27578
27579 break;
27580 }
27581 case ISD::XOR:
27582 return performXorCombine(N, DAG, DCI, Subtarget);
27583 case ISD::MUL:
27584 return performMulCombine(N, DAG, DCI, Subtarget);
27585 case ISD::SINT_TO_FP:
27586 case ISD::UINT_TO_FP:
27587 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27588 case ISD::FP_TO_SINT:
27589 case ISD::FP_TO_UINT:
27590 case ISD::FP_TO_SINT_SAT:
27591 case ISD::FP_TO_UINT_SAT:
27592 return performFpToIntCombine(N, DAG, DCI, Subtarget);
27593 case ISD::OR:
27594 return performORCombine(N, DCI, Subtarget, *this);
27595 case ISD::AND:
27596 return performANDCombine(N, DCI);
27597 case ISD::FADD:
27598 return performFADDCombine(N, DCI);
27599 case ISD::INTRINSIC_WO_CHAIN:
27600 return performIntrinsicCombine(N, DCI, Subtarget);
27601 case ISD::ANY_EXTEND:
27602 case ISD::ZERO_EXTEND:
27603 case ISD::SIGN_EXTEND:
27604 return performExtendCombine(N, DCI, DAG);
27605 case ISD::SIGN_EXTEND_INREG:
27606 return performSignExtendInRegCombine(N, DCI, DAG);
27607 case ISD::CONCAT_VECTORS:
27608 return performConcatVectorsCombine(N, DCI, DAG);
27609 case ISD::EXTRACT_SUBVECTOR:
27610 return performExtractSubvectorCombine(N, DCI, DAG);
27611 case ISD::INSERT_SUBVECTOR:
27612 return performInsertSubvectorCombine(N, DCI, DAG);
27613 case ISD::SELECT:
27614 return performSelectCombine(N, DCI);
27615 case ISD::VSELECT:
27616 return performVSelectCombine(N, DCI.DAG);
27617 case ISD::SETCC:
27618 return performSETCCCombine(N, DCI, DAG);
27619 case ISD::LOAD:
27620 return performLOADCombine(N, DCI, DAG, Subtarget);
27621 case ISD::STORE:
27622 return performSTORECombine(N, DCI, DAG, Subtarget);
27623 case ISD::MSTORE:
27624 return performMSTORECombine(N, DCI, DAG, Subtarget);
27625 case ISD::MGATHER:
27626 case ISD::MSCATTER:
27627 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
27628 return performMaskedGatherScatterCombine(N, DCI, DAG);
27629 case ISD::FP_EXTEND:
27630 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27631 case AArch64ISD::BRCOND:
27632 return performBRCONDCombine(N, DCI, DAG);
27633 case AArch64ISD::TBNZ:
27634 case AArch64ISD::TBZ:
27635 return performTBZCombine(N, DCI, DAG);
27636 case AArch64ISD::CSEL:
27637 return performCSELCombine(N, DCI, DAG);
27638 case AArch64ISD::DUP:
27639 case AArch64ISD::DUPLANE8:
27640 case AArch64ISD::DUPLANE16:
27641 case AArch64ISD::DUPLANE32:
27642 case AArch64ISD::DUPLANE64:
27643 return performDUPCombine(N, DCI);
27644 case AArch64ISD::DUPLANE128:
27645 return performDupLane128Combine(N, DAG);
27646 case AArch64ISD::NVCAST:
27647 return performNVCASTCombine(N, DAG);
27648 case AArch64ISD::SPLICE:
27649 return performSpliceCombine(N, DAG);
27650 case AArch64ISD::UUNPKLO:
27651 case AArch64ISD::UUNPKHI:
27652 return performUnpackCombine(N, DAG, Subtarget);
27653 case AArch64ISD::UZP1:
27654 case AArch64ISD::UZP2:
27655 return performUzpCombine(N, DAG, Subtarget);
27656 case AArch64ISD::SETCC_MERGE_ZERO:
27657 return performSetccMergeZeroCombine(N, DCI);
27658 case AArch64ISD::REINTERPRET_CAST:
27659 return performReinterpretCastCombine(N, DAG);
27660 case AArch64ISD::GLD1_MERGE_ZERO:
27661 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27662 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27663 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27664 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27665 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27666 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27667 case AArch64ISD::GLD1S_MERGE_ZERO:
27668 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27669 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27670 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27671 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27672 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27673 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27674 return performGLD1Combine(N, DAG);
27675 case AArch64ISD::VASHR:
27676 case AArch64ISD::VLSHR:
27677 return performVectorShiftCombine(N, *this, DCI);
27678 case AArch64ISD::SUNPKLO:
27679 return performSunpkloCombine(N, DAG);
27680 case AArch64ISD::BSP:
27681 return performBSPExpandForSVE(N, DAG, Subtarget);
27682 case ISD::INSERT_VECTOR_ELT:
27683 return performInsertVectorEltCombine(N, DCI);
27684 case ISD::EXTRACT_VECTOR_ELT:
27685 return performExtractVectorEltCombine(N, DCI, Subtarget);
27686 case ISD::VECREDUCE_ADD:
27687 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27688 case ISD::GET_ACTIVE_LANE_MASK:
27689 return performActiveLaneMaskCombine(N, DCI, Subtarget);
27690 case AArch64ISD::UADDV:
27691 return performUADDVCombine(N, DAG);
27692 case AArch64ISD::SMULL:
27693 case AArch64ISD::UMULL:
27694 case AArch64ISD::PMULL:
27695 return performMULLCombine(N, DCI, DAG);
27696 case ISD::INTRINSIC_VOID:
27697 case ISD::INTRINSIC_W_CHAIN:
27698 switch (N->getConstantOperandVal(1)) {
27699 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27700 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27701 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27702 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27703 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27704 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27705 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27706 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27707 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27708 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27709 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27710 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27711 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27712 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27713 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27714 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27715 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
27716 case Intrinsic::aarch64_neon_ld2:
27717 case Intrinsic::aarch64_neon_ld3:
27718 case Intrinsic::aarch64_neon_ld4:
27719 case Intrinsic::aarch64_neon_ld1x2:
27720 case Intrinsic::aarch64_neon_ld1x3:
27721 case Intrinsic::aarch64_neon_ld1x4:
27722 case Intrinsic::aarch64_neon_ld2lane:
27723 case Intrinsic::aarch64_neon_ld3lane:
27724 case Intrinsic::aarch64_neon_ld4lane:
27725 case Intrinsic::aarch64_neon_ld2r:
27726 case Intrinsic::aarch64_neon_ld3r:
27727 case Intrinsic::aarch64_neon_ld4r:
27728 case Intrinsic::aarch64_neon_st2:
27729 case Intrinsic::aarch64_neon_st3:
27730 case Intrinsic::aarch64_neon_st4:
27731 case Intrinsic::aarch64_neon_st1x2:
27732 case Intrinsic::aarch64_neon_st1x3:
27733 case Intrinsic::aarch64_neon_st1x4:
27734 case Intrinsic::aarch64_neon_st2lane:
27735 case Intrinsic::aarch64_neon_st3lane:
27736 case Intrinsic::aarch64_neon_st4lane:
27737 return performNEONPostLDSTCombine(N, DCI, DAG);
27738 case Intrinsic::aarch64_sve_ldnt1:
27739 return performLDNT1Combine(N, DAG);
27740 case Intrinsic::aarch64_sve_ld1rq:
27741 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
27742 case Intrinsic::aarch64_sve_ld1ro:
27743 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
27744 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27745 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27746 case Intrinsic::aarch64_sve_ldnt1_gather:
27747 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27748 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27749 return performGatherLoadCombine(N, DAG,
27750 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27751 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27752 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27753 case Intrinsic::aarch64_sve_ld1:
27754 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27755 case Intrinsic::aarch64_sve_ldnf1:
27756 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27757 case Intrinsic::aarch64_sve_ldff1:
27758 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27759 case Intrinsic::aarch64_sve_st1:
27760 return performST1Combine(N, DAG);
27761 case Intrinsic::aarch64_sve_stnt1:
27762 return performSTNT1Combine(N, DAG);
27763 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27764 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27765 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27766 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27767 case Intrinsic::aarch64_sve_stnt1_scatter:
27768 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27769 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27770 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27771 case Intrinsic::aarch64_sve_ld1_gather:
27772 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
27773 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
27774 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
27775 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
27776 case Intrinsic::aarch64_sve_ld1q_gather_index:
27777 return performGatherLoadCombine(N, DAG,
27778 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
27779 case Intrinsic::aarch64_sve_ld1_gather_index:
27780 return performGatherLoadCombine(N, DAG,
27781 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
27782 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
27783 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
27784 /*OnlyPackedOffsets=*/false);
27785 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
27786 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
27787 /*OnlyPackedOffsets=*/false);
27788 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
27789 return performGatherLoadCombine(N, DAG,
27790 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
27791 /*OnlyPackedOffsets=*/false);
27792 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
27793 return performGatherLoadCombine(N, DAG,
27794 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
27795 /*OnlyPackedOffsets=*/false);
27796 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
27797 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
27798 case Intrinsic::aarch64_sve_ldff1_gather:
27799 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
27800 case Intrinsic::aarch64_sve_ldff1_gather_index:
27801 return performGatherLoadCombine(N, DAG,
27802 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
27803 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
27804 return performGatherLoadCombine(N, DAG,
27805 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
27806 /*OnlyPackedOffsets=*/false);
27807 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
27808 return performGatherLoadCombine(N, DAG,
27809 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
27810 /*OnlyPackedOffsets=*/false);
27811 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
27812 return performGatherLoadCombine(N, DAG,
27813 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
27814 /*OnlyPackedOffsets=*/false);
27815 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
27816 return performGatherLoadCombine(N, DAG,
27817 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
27818 /*OnlyPackedOffsets=*/false);
27819 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
27820 return performGatherLoadCombine(N, DAG,
27821 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
27822 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
27823 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
27824 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
27825 case Intrinsic::aarch64_sve_st1q_scatter_index:
27826 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
27827 case Intrinsic::aarch64_sve_st1_scatter:
27828 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
27829 case Intrinsic::aarch64_sve_st1_scatter_index:
27830 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
27831 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
27832 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
27833 /*OnlyPackedOffsets=*/false);
27834 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
27835 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
27836 /*OnlyPackedOffsets=*/false);
27837 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
27838 return performScatterStoreCombine(N, DAG,
27839 AArch64ISD::SST1_SXTW_SCALED_PRED,
27840 /*OnlyPackedOffsets=*/false);
27841 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
27842 return performScatterStoreCombine(N, DAG,
27843 AArch64ISD::SST1_UXTW_SCALED_PRED,
27844 /*OnlyPackedOffsets=*/false);
27845 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
27846 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
27847 case Intrinsic::aarch64_rndr:
27848 case Intrinsic::aarch64_rndrrs:
27849 return performRNDRCombine(N, DAG);
27850 case Intrinsic::aarch64_sme_ldr_zt:
27851 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
27852 DAG.getVTList(MVT::Other), N->getOperand(0),
27853 N->getOperand(2), N->getOperand(3));
27854 case Intrinsic::aarch64_sme_str_zt:
27855 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
27856 DAG.getVTList(MVT::Other), N->getOperand(0),
27857 N->getOperand(2), N->getOperand(3));
27858 default:
27859 break;
27860 }
27861 break;
27862 case ISD::GlobalAddress:
27863 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
27864 case ISD::CTLZ:
27865 return performCTLZCombine(N, DAG, Subtarget);
27866 case ISD::SCALAR_TO_VECTOR:
27867 return performScalarToVectorCombine(N, DCI, DAG);
27868 case ISD::SHL:
27869 return performSHLCombine(N, DCI, DAG);
27870 }
27871 return SDValue();
27872}
27873
27874// Check if the return value is used only as a return value, as otherwise
27875// we can't perform a tail-call. In particular, we need to check for
27876// target ISD nodes that are returns and any other "odd" constructs
27877// that the generic analysis code won't necessarily catch.
27878bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
27879 SDValue &Chain) const {
27880 if (N->getNumValues() != 1)
27881 return false;
27882 if (!N->hasNUsesOfValue(1, 0))
27883 return false;
27884
27885 SDValue TCChain = Chain;
27886 SDNode *Copy = *N->user_begin();
27887 if (Copy->getOpcode() == ISD::CopyToReg) {
27888 // If the copy has a glue operand, we conservatively assume it isn't safe to
27889 // perform a tail call.
27890 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
27891 MVT::Glue)
27892 return false;
27893 TCChain = Copy->getOperand(0);
27894 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27895 return false;
27896
27897 bool HasRet = false;
27898 for (SDNode *Node : Copy->users()) {
27899 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27900 return false;
27901 HasRet = true;
27902 }
27903
27904 if (!HasRet)
27905 return false;
27906
27907 Chain = TCChain;
27908 return true;
27909}
27910
27911// Return whether an instruction can potentially be optimized to a tail
27912// call. This will cause the optimizers to attempt to move, or duplicate,
27913// return instructions to help enable tail call optimizations for this
27914// instruction.
27915bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27916 return CI->isTailCall();
27917}
27918
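// GlobalISel hook: pre/post-indexed addressing is legal only for a non-zero
// constant offset that fits in the signed 9-bit immediate of the indexed
// load/store instructions.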
27919bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
27920 Register Offset, bool IsPre,
27921 MachineRegisterInfo &MRI) const {
27922 auto CstOffset = getIConstantVRegVal(Offset, MRI);
27923 if (!CstOffset || CstOffset->isZero())
27924 return false;
27925
27926 // All of the indexed addressing mode instructions take a signed 9 bit
27927 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
27928 // encodes the sign/indexing direction.
27929 return isInt<9>(CstOffset->getSExtValue());
27930}
27931
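// Shared helper for pre/post-indexed address matching: accepts an ADD/SUB of
// the base pointer with a signed 9-bit constant offset, rejecting cases where
// a replicating load (ld1r*) or big-endian VLD1/VST1 constraints make the
// indexed form unprofitable or invalid.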
27932bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
27933 SDValue &Base,
27934 SDValue &Offset,
27935 SelectionDAG &DAG) const {
27936 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
27937 return false;
27938
27939 // Non-null if there is exactly one user of the loaded value (ignoring chain).
27940 SDNode *ValOnlyUser = nullptr;
27941 for (SDUse &U : N->uses()) {
27942 if (U.getResNo() == 1)
27943 continue; // Ignore chain.
27944 if (ValOnlyUser == nullptr)
27945 ValOnlyUser = U.getUser();
27946 else {
27947 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
27948 break;
27949 }
27950 }
27951
27952 auto IsUndefOrZero = [](SDValue V) {
27953 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
27954 };
27955
27956 // If the only user of the value is a scalable vector splat, it is
27957 // preferable to do a replicating load (ld1r*).
27958 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
27959 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
27960 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
27961 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
27962 return false;
27963
27964 Base = Op->getOperand(0);
27965 // All of the indexed addressing mode instructions take a signed
27966 // 9 bit immediate offset.
27967 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
27968 int64_t RHSC = RHS->getSExtValue();
27969 if (Op->getOpcode() == ISD::SUB)
27970 RHSC = -(uint64_t)RHSC;
27971 if (!isInt<9>(RHSC))
27972 return false;
27973 // When big-endian VLD1/VST1 are used for vector load and store, they
27974 // only allow an offset that's equal to the store size.
27975 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
27976 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
27977 (uint64_t)RHSC != MemType.getStoreSize())
27978 return false;
27979 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
27980 // when dealing with subtraction.
27981 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
27982 return true;
27983 }
27984 return false;
27985}
27986
27987bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
27988 SDValue &Offset,
27989 ISD::MemIndexedMode &AM,
27990 SelectionDAG &DAG) const {
27991 EVT VT;
27992 SDValue Ptr;
27993 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
27994 VT = LD->getMemoryVT();
27995 Ptr = LD->getBasePtr();
27996 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
27997 VT = ST->getMemoryVT();
27998 Ptr = ST->getBasePtr();
27999 } else
28000 return false;
28001
28002 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28003 return false;
28004 AM = ISD::PRE_INC;
28005 return true;
28006}
28007
28008bool AArch64TargetLowering::getPostIndexedAddressParts(
28009 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
28010 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28011 EVT VT;
28012 SDValue Ptr;
28013 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28014 VT = LD->getMemoryVT();
28015 Ptr = LD->getBasePtr();
28016 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28017 VT = ST->getMemoryVT();
28018 Ptr = ST->getBasePtr();
28019 } else
28020 return false;
28021
28022 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28023 return false;
28024 // Post-indexing updates the base, so it's not a valid transform
28025 // if that's not the same as the load's pointer.
28026 if (Ptr != Base)
28027 return false;
28028 AM = ISD::POST_INC;
28029 return true;
28030}
28031
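// Lower a bitcast from an i1 vector to a scalar integer by building a bitmask
// from the predicate lanes (see vectorToScalarBitmask), skipping the undef
// concatenation that Clang's __builtin_convertvector introduces for short
// vectors.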
28032static void replaceBoolVectorBitcast(SDNode *N,
28033 SmallVectorImpl<SDValue> &Results,
28034 SelectionDAG &DAG) {
28035 SDLoc DL(N);
28036 SDValue Op = N->getOperand(0);
28037 EVT VT = N->getValueType(0);
28038 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28039 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28040 "Must be bool vector.");
28041
28042 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28043 // elements, it adds a vector concatenation with undef(s). If we encounter
28044 // this here, we can skip the concat.
28045 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28046 bool AllUndef = true;
28047 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
28048 AllUndef &= Op.getOperand(I).isUndef();
28049
28050 if (AllUndef)
28051 Op = Op.getOperand(0);
28052 }
28053
28054 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
28055 if (VectorBits)
28056 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
28057}
28058
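// Helper for ReplaceBITCASTResults: widens the scalar into lane 0 of ExtendVT,
// bitcasts to CastVT and extracts the requested subvector, so bitcasts to
// non-legal small vector types (e.g. v2i16, v4i8) can be expressed with legal
// ones.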
28059static void CustomNonLegalBITCASTResults(SDNode *N,
28060 SmallVectorImpl<SDValue> &Results,
28061 SelectionDAG &DAG, EVT ExtendVT,
28062 EVT CastVT) {
28063 SDLoc DL(N);
28064 SDValue Op = N->getOperand(0);
28065 EVT VT = N->getValueType(0);
28066
28067 // Use SCALAR_TO_VECTOR for lane zero
28068 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
28069 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
28070 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
28071 Results.push_back(
28072 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
28073}
28074
28075void AArch64TargetLowering::ReplaceBITCASTResults(
28076 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28077 SDLoc DL(N);
28078 SDValue Op = N->getOperand(0);
28079 EVT VT = N->getValueType(0);
28080 EVT SrcVT = Op.getValueType();
28081
28082 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
28083 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
28084 return;
28085 }
28086
28087 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
28088 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
28089 return;
28090 }
28091
28092 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
28093 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
28094 return;
28095 }
28096
28097 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
28098 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
28099 "Expected fp->int bitcast!");
28100
28101 // Bitcasting between unpacked vector types of different element counts is
28102 // not a NOP because the live elements are laid out differently.
28103 // 01234567
28104 // e.g. nxv2i32 = XX??XX??
28105 // nxv4f16 = X?X?X?X?
28106 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
28107 return;
28108
28109 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
28110 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
28111 return;
28112 }
28113
28114 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28115 !VT.isVector())
28116 return replaceBoolVectorBitcast(N, Results, DAG);
28117
28118 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
28119 return;
28120
28121 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
28122 DAG.getUNDEF(MVT::i32), Op);
28123 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
28124 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
28125}
28126
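// Try to convert a 256-bit (F)ADD of a value with its pairwise-swapped shuffle
// (mask 1,0,3,2,...) into an ADDP on the split halves, duplicating each
// pairwise sum back into both lanes of the result.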
28127static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
28128 SelectionDAG &DAG,
28129 const AArch64Subtarget *Subtarget) {
28130 EVT VT = N->getValueType(0);
28131 if (!VT.is256BitVector() ||
28132 (VT.getScalarType().isFloatingPoint() &&
28133 !N->getFlags().hasAllowReassociation()) ||
28134 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
28135 VT.getScalarType() == MVT::bf16)
28136 return;
28137
28138 SDValue X = N->getOperand(0);
28139 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
28140 if (!Shuf) {
28141 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
28142 X = N->getOperand(1);
28143 if (!Shuf)
28144 return;
28145 }
28146
28147 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
28148 return;
28149
28150 // Check the mask is 1,0,3,2,5,4,...
28151 ArrayRef<int> Mask = Shuf->getMask();
28152 for (int I = 0, E = Mask.size(); I < E; I++)
28153 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
28154 return;
28155
28156 SDLoc DL(N);
28157 auto LoHi = DAG.SplitVector(X, DL);
28158 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
28159 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
28160 LoHi.first, LoHi.second);
28161
28162 // Shuffle the elements back into order.
28163 SmallVector<int> NMask;
28164 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
28165 NMask.push_back(I);
28166 NMask.push_back(I);
28167 }
28168 Results.push_back(
28169 DAG.getVectorShuffle(VT, DL,
28170 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
28171 DAG.getUNDEF(LoHi.first.getValueType())),
28172 DAG.getUNDEF(VT), NMask));
28173}
28174
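// Split the vector operand in half, combine the halves with InterOp and then
// reduce the result with the target-specific across-vector node AcrossOp.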
28175static void ReplaceReductionResults(SDNode *N,
28176 SmallVectorImpl<SDValue> &Results,
28177 SelectionDAG &DAG, unsigned InterOp,
28178 unsigned AcrossOp) {
28179 EVT LoVT, HiVT;
28180 SDValue Lo, Hi;
28181 SDLoc DL(N);
28182 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
28183 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28184 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
28185 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
28186 Results.push_back(SplitVal);
28187}
28188
28189void AArch64TargetLowering::ReplaceExtractSubVectorResults(
28190 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28191 SDValue In = N->getOperand(0);
28192 EVT InVT = In.getValueType();
28193
28194 // Common code will handle these just fine.
28195 if (!InVT.isScalableVector() || !InVT.isInteger())
28196 return;
28197
28198 SDLoc DL(N);
28199 EVT VT = N->getValueType(0);
28200
28201 // The following checks bail if this is not a halving operation.
28202
28203 ElementCount ResEC = VT.getVectorElementCount();
28204
28205 if (InVT.getVectorElementCount() != (ResEC * 2))
28206 return;
28207
28208 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
28209 if (!CIndex)
28210 return;
28211
28212 unsigned Index = CIndex->getZExtValue();
28213 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
28214 return;
28215
28216 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
28217 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28218
28219 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
28220 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
28221}
28222
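// Lower get.active.lane.mask producing nxv32i1 by emitting the SVE2p1/SME2
// whilelo_x2 intrinsic pair and concatenating the two half-width predicates.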
28223void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
28224 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28225 assert((Subtarget->hasSVE2p1() ||
28226 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
28227 "Custom lower of get.active.lane.mask missing required feature.");
28228
28229 assert(N->getValueType(0) == MVT::nxv32i1 &&
28230 "Unexpected result type for get.active.lane.mask");
28231
28232 SDLoc DL(N);
28233 SDValue Idx = N->getOperand(0);
28234 SDValue TC = N->getOperand(1);
28235
28236 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
28237 "Unexpected operand type for get.active.lane.mask");
28238
28239 if (Idx.getValueType() != MVT::i64) {
28240 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
28241 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
28242 }
28243
28244 SDValue ID =
28245 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
28246 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
28247 auto WideMask =
28248 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
28249
28250 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
28251 {WideMask.getValue(0), WideMask.getValue(1)}));
28252}
28253
28254// Create an even/odd pair of X registers holding integer value V.
28255static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
28256 SDLoc DL(V.getNode());
28257 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
28258 if (DAG.getDataLayout().isBigEndian())
28259 std::swap (VLo, VHi);
28260 SDValue RegClass =
28261 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
28262 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
28263 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
28264 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
28265 return SDValue(
28266 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
28267}
28268
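// Expand a 128-bit ATOMIC_CMP_SWAP: with LSE (or outlined atomics) emit a CASP
// variant chosen from the memory ordering, otherwise fall back to the
// CMP_SWAP_128* pseudo instructions.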
28269static void ReplaceCMP_SWAP_128Results(SDNode *N,
28270 SmallVectorImpl<SDValue> &Results,
28271 SelectionDAG &DAG,
28272 const AArch64Subtarget *Subtarget) {
28273 assert(N->getValueType(0) == MVT::i128 &&
28274 "AtomicCmpSwap on types less than 128 should be legal");
28275
28276 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28277 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
28278 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
28279 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
28280 SDValue Ops[] = {
28281 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
28282 createGPRPairNode(DAG, N->getOperand(3)), // Store value
28283 N->getOperand(1), // Ptr
28284 N->getOperand(0), // Chain in
28285 };
28286
28287 unsigned Opcode;
28288 switch (MemOp->getMergedOrdering()) {
28289 case AtomicOrdering::Monotonic:
28290 Opcode = AArch64::CASPX;
28291 break;
28292 case AtomicOrdering::Acquire:
28293 Opcode = AArch64::CASPAX;
28294 break;
28295 case AtomicOrdering::Release:
28296 Opcode = AArch64::CASPLX;
28297 break;
28298 case AtomicOrdering::AcquireRelease:
28299 case AtomicOrdering::SequentiallyConsistent:
28300 Opcode = AArch64::CASPALX;
28301 break;
28302 default:
28303 llvm_unreachable("Unexpected ordering!");
28304 }
28305
28306 MachineSDNode *CmpSwap = DAG.getMachineNode(
28307 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
28308 DAG.setNodeMemRefs(CmpSwap, {MemOp});
28309
28310 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
28311 if (DAG.getDataLayout().isBigEndian())
28312 std::swap(SubReg1, SubReg2);
28313 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
28314 SDValue(CmpSwap, 0));
28315 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
28316 SDValue(CmpSwap, 0));
28317 Results.push_back(
28318 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28319 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
28320 return;
28321 }
28322
28323 unsigned Opcode;
28324 switch (MemOp->getMergedOrdering()) {
28325 case AtomicOrdering::Monotonic:
28326 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
28327 break;
28328 case AtomicOrdering::Acquire:
28329 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
28330 break;
28331 case AtomicOrdering::Release:
28332 Opcode = AArch64::CMP_SWAP_128_RELEASE;
28333 break;
28334 case AtomicOrdering::AcquireRelease:
28335 case AtomicOrdering::SequentiallyConsistent:
28336 Opcode = AArch64::CMP_SWAP_128;
28337 break;
28338 default:
28339 llvm_unreachable("Unexpected ordering!");
28340 }
28341
28342 SDLoc DL(N);
28343 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
28344 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
28345 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
28346 New.first, New.second, N->getOperand(0)};
28347 SDNode *CmpSwap = DAG.getMachineNode(
28348 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
28349 Ops);
28350 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
28351
28352 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28353 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
28354 Results.push_back(SDValue(CmpSwap, 3));
28355}
28356
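// Map a 128-bit atomicrmw (and/or/xchg) plus its memory ordering onto the
// corresponding LSE128 instruction (LDCLRP*/LDSETP*/SWPP*).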
28357static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
28358 AtomicOrdering Ordering) {
28359 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
28360 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
28361 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
28362 // ATOMIC_LOAD_CLR at any point.
28363 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
28364 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
28365 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
28366 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
28367
28368 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28369 // The operand will need to be XORed in a separate step.
28370 switch (Ordering) {
28371 case AtomicOrdering::Monotonic:
28372 return AArch64::LDCLRP;
28373 break;
28374 case AtomicOrdering::Acquire:
28375 return AArch64::LDCLRPA;
28376 break;
28377 case AtomicOrdering::Release:
28378 return AArch64::LDCLRPL;
28379 break;
28380 case AtomicOrdering::AcquireRelease:
28381 case AtomicOrdering::SequentiallyConsistent:
28382 return AArch64::LDCLRPAL;
28383 break;
28384 default:
28385 llvm_unreachable("Unexpected ordering!");
28386 }
28387 }
28388
28389 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
28390 switch (Ordering) {
28391 case AtomicOrdering::Monotonic:
28392 return AArch64::LDSETP;
28393 break;
28394 case AtomicOrdering::Acquire:
28395 return AArch64::LDSETPA;
28396 break;
28397 case AtomicOrdering::Release:
28398 return AArch64::LDSETPL;
28399 break;
28400 case AtomicOrdering::AcquireRelease:
28401 case AtomicOrdering::SequentiallyConsistent:
28402 return AArch64::LDSETPAL;
28403 break;
28404 default:
28405 llvm_unreachable("Unexpected ordering!");
28406 }
28407 }
28408
28409 if (ISDOpcode == ISD::ATOMIC_SWAP) {
28410 switch (Ordering) {
28411 case AtomicOrdering::Monotonic:
28412 return AArch64::SWPP;
28413 break;
28414 case AtomicOrdering::Acquire:
28415 return AArch64::SWPPA;
28416 break;
28417 case AtomicOrdering::Release:
28418 return AArch64::SWPPL;
28419 break;
28420 case AtomicOrdering::AcquireRelease:
28421 case AtomicOrdering::SequentiallyConsistent:
28422 return AArch64::SWPPAL;
28423 break;
28424 default:
28425 llvm_unreachable("Unexpected ordering!");
28426 }
28427 }
28428
28429 llvm_unreachable("Unexpected ISDOpcode!");
28430}
28431
28434 SelectionDAG &DAG,
28435 const AArch64Subtarget *Subtarget) {
28436 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
28437 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28438 // rather than the CASP instructions, because CASP has register classes for
28439 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28440 // to present them as single operands. LSE128 instructions use the GPR64
28441 // register class (because the pair does not have to be sequential), like
28442 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28443
28444 assert(N->getValueType(0) == MVT::i128 &&
28445 "AtomicLoadXXX on types less than 128 should be legal");
28446
28447 if (!Subtarget->hasLSE128())
28448 return;
28449
28450 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28451 const SDValue &Chain = N->getOperand(0);
28452 const SDValue &Ptr = N->getOperand(1);
28453 const SDValue &Val128 = N->getOperand(2);
28454 std::pair<SDValue, SDValue> Val2x64 =
28455 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28456
28457 const unsigned ISDOpcode = N->getOpcode();
28458 const unsigned MachineOpcode =
28459 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28460
28461 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28462 SDLoc DL(Val128);
28463 Val2x64.first =
28464 DAG.getNode(ISD::XOR, DL, MVT::i64,
28465 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28466 Val2x64.second =
28467 DAG.getNode(ISD::XOR, DL, MVT::i64,
28468 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28469 }
28470
28471 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28472 if (DAG.getDataLayout().isBigEndian())
28473 std::swap(Ops[0], Ops[1]);
28474
28475 MachineSDNode *AtomicInst =
28476 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28477 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28478
28479 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28480
28481 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28482 if (DAG.getDataLayout().isBigEndian())
28483 std::swap(Lo, Hi);
28484
28485 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28486 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28487}
28488
28489void AArch64TargetLowering::ReplaceNodeResults(
28490 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28491 switch (N->getOpcode()) {
28492 default:
28493 llvm_unreachable("Don't know how to custom expand this");
28494 case ISD::BITCAST:
28495 ReplaceBITCASTResults(N, Results, DAG);
28496 return;
28497 case ISD::VECREDUCE_ADD:
28498 case ISD::VECREDUCE_SMAX:
28499 case ISD::VECREDUCE_SMIN:
28500 case ISD::VECREDUCE_UMAX:
28501 case ISD::VECREDUCE_UMIN:
28502 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28503 return;
28504 case ISD::VECTOR_COMPRESS:
28505 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28506 Results.push_back(Res);
28507 return;
28508 case ISD::ADD:
28509 case ISD::FADD:
28510 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28511 return;
28512
28513 case ISD::CTPOP:
28514 case ISD::PARITY:
28515 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28516 Results.push_back(Result);
28517 return;
28518 case AArch64ISD::SADDV:
28519 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28520 return;
28521 case AArch64ISD::UADDV:
28522 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28523 return;
28524 case AArch64ISD::SMINV:
28525 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28526 return;
28527 case AArch64ISD::UMINV:
28528 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28529 return;
28530 case AArch64ISD::SMAXV:
28531 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28532 return;
28533 case AArch64ISD::UMAXV:
28534 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28535 return;
28536 case ISD::MULHS:
28538 Results.push_back(
28539 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28540 return;
28541 case ISD::MULHU:
28543 Results.push_back(
28544 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28545 return;
28546 case ISD::FP_TO_UINT:
28547 case ISD::FP_TO_SINT:
28548 case ISD::STRICT_FP_TO_SINT:
28549 case ISD::STRICT_FP_TO_UINT:
28550 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28551 // Let normal code take care of it by not adding anything to Results.
28552 return;
28553 case ISD::ATOMIC_CMP_SWAP:
28554 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28555 return;
28556 case ISD::ATOMIC_LOAD_CLR:
28557 assert(N->getValueType(0) != MVT::i128 &&
28558 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28559 break;
28560 case ISD::ATOMIC_LOAD_AND:
28561 case ISD::ATOMIC_LOAD_OR:
28562 case ISD::ATOMIC_SWAP: {
28563 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28564 "Expected 128-bit atomicrmw.");
28565 // These need custom type legalisation so we go directly to instruction.
28566 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28567 return;
28568 }
28569 case ISD::ADDRSPACECAST: {
28570 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28571 Results.push_back(V);
28572 return;
28573 }
28574 case ISD::ATOMIC_LOAD:
28575 case ISD::LOAD: {
28576 MemSDNode *LoadNode = cast<MemSDNode>(N);
28577 EVT MemVT = LoadNode->getMemoryVT();
28578 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
28579 // targets.
28580 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28581 MemVT.getSizeInBits() == 256u &&
28582 (MemVT.getScalarSizeInBits() == 8u ||
28583 MemVT.getScalarSizeInBits() == 16u ||
28584 MemVT.getScalarSizeInBits() == 32u ||
28585 MemVT.getScalarSizeInBits() == 64u)) {
28586
28587 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28588 SDValue Result = DAG.getMemIntrinsicNode(
28589 AArch64ISD::LDNP, SDLoc(N),
28590 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28591 {LoadNode->getChain(), LoadNode->getBasePtr()},
28592 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28593
28594 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28595 DAG.getBitcast(HalfVT, Result.getValue(0)),
28596 DAG.getBitcast(HalfVT, Result.getValue(1)));
28597 Results.append({Pair, Result.getValue(2) /* Chain */});
28598 return;
28599 }
28600
28601 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28602 LoadNode->getMemoryVT() != MVT::i128) {
28603 // Loads that are neither volatile nor atomic are optimized later by
28604 // AArch64's load/store optimizer.
28605 return;
28606 }
28607
28608 if (SDValue(N, 0).getValueType() == MVT::i128) {
28609 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28610 bool isLoadAcquire =
28612 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28613
28614 if (isLoadAcquire)
28615 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28616
28617 SDValue Result = DAG.getMemIntrinsicNode(
28618 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28619 {LoadNode->getChain(), LoadNode->getBasePtr()},
28620 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28621
28622 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28623
28624 SDValue Pair =
28625 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28626 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28627 Results.append({Pair, Result.getValue(2) /* Chain */});
28628 }
28629 return;
28630 }
28631 case ISD::EXTRACT_SUBVECTOR:
28632 ReplaceExtractSubVectorResults(N, Results, DAG);
28633 return;
28634 case ISD::INSERT_SUBVECTOR:
28635 case ISD::CONCAT_VECTORS:
28636 // Custom lowering has been requested for INSERT_SUBVECTOR and
28637 // CONCAT_VECTORS -- but delegate to common code for result type
28638 // legalisation
28639 return;
28640 case ISD::GET_ACTIVE_LANE_MASK:
28641 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28642 return;
28643 case ISD::INTRINSIC_WO_CHAIN: {
28644 EVT VT = N->getValueType(0);
28645
28646 Intrinsic::ID IntID =
28647 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28648 switch (IntID) {
28649 default:
28650 return;
28651 case Intrinsic::aarch64_sve_clasta_n: {
28652 assert((VT == MVT::i8 || VT == MVT::i16) &&
28653 "custom lowering for unexpected type");
28654 SDLoc DL(N);
28655 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28656 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28657 N->getOperand(1), Op2, N->getOperand(3));
28658 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28659 return;
28660 }
28661 case Intrinsic::aarch64_sve_clastb_n: {
28662 assert((VT == MVT::i8 || VT == MVT::i16) &&
28663 "custom lowering for unexpected type");
28664 SDLoc DL(N);
28665 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28666 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28667 N->getOperand(1), Op2, N->getOperand(3));
28668 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28669 return;
28670 }
28671 case Intrinsic::aarch64_sve_lasta: {
28672 assert((VT == MVT::i8 || VT == MVT::i16) &&
28673 "custom lowering for unexpected type");
28674 SDLoc DL(N);
28675 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28676 N->getOperand(1), N->getOperand(2));
28677 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28678 return;
28679 }
28680 case Intrinsic::aarch64_sve_lastb: {
28681 assert((VT == MVT::i8 || VT == MVT::i16) &&
28682 "custom lowering for unexpected type");
28683 SDLoc DL(N);
28684 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28685 N->getOperand(1), N->getOperand(2));
28686 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28687 return;
28688 }
28689 case Intrinsic::aarch64_sme_in_streaming_mode: {
28690 SDLoc DL(N);
28691 SDValue Chain = DAG.getEntryNode();
28692
28693 SDValue RuntimePStateSM =
28694 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28695 Results.push_back(
28696 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28697 return;
28698 }
28699 case Intrinsic::experimental_vector_match: {
28700 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28701 return;
28702
28703 // NOTE: Only trivial type promotion is supported.
28704 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28705 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28706 return;
28707
28708 SDLoc DL(N);
28709 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28710 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28711 return;
28712 }
28713 }
28714 }
28715 case ISD::READ_REGISTER: {
28716 SDLoc DL(N);
28717 assert(N->getValueType(0) == MVT::i128 &&
28718 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28719 SDValue Chain = N->getOperand(0);
28720 SDValue SysRegName = N->getOperand(1);
28721
28722 SDValue Result = DAG.getNode(
28723 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28724 Chain, SysRegName);
28725
28726 // Sysregs are not endian. Result.getValue(0) always contains the lower half
28727 // of the 128-bit System Register value.
28728 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28729 Result.getValue(0), Result.getValue(1));
28730 Results.push_back(Pair);
28731 Results.push_back(Result.getValue(2)); // Chain
28732 return;
28733 }
28734 }
28735}
28736
28737bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
28738 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28739 return false;
28740 return true;
28741}
28742
28743unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
28744 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28745 // reciprocal if there are three or more FDIVs.
28746 return 3;
28747}
28748
28749TargetLoweringBase::LegalizeTypeAction
28750AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
28751 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
28752 // v4i16, v2i32 instead of to promote.
28753 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28754 VT == MVT::v1f32)
28755 return TypeWidenVector;
28756
28757 return TargetLoweringBase::getPreferredVectorAction(VT);
28758}
28759
28760// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28761// provided the address is 16-byte aligned.
28762bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
28763 if (!Subtarget->hasLSE2())
28764 return false;
28765
28766 if (auto LI = dyn_cast<LoadInst>(I))
28767 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28768 LI->getAlign() >= Align(16);
28769
28770 if (auto SI = dyn_cast<StoreInst>(I))
28771 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28772 SI->getAlign() >= Align(16);
28773
28774 return false;
28775}
28776
28777bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
28778 if (!Subtarget->hasLSE128())
28779 return false;
28780
28781 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
28782 // will clobber the two registers.
28783 if (const auto *SI = dyn_cast<StoreInst>(I))
28784 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28785 SI->getAlign() >= Align(16) &&
28786 (SI->getOrdering() == AtomicOrdering::Release ||
28787 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
28788
28789 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
28790 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28791 RMW->getAlign() >= Align(16) &&
28792 (RMW->getOperation() == AtomicRMWInst::Xchg ||
28793 RMW->getOperation() == AtomicRMWInst::And ||
28794 RMW->getOperation() == AtomicRMWInst::Or);
28795
28796 return false;
28797}
28798
28799bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
28800 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
28801 return false;
28802
28803 if (auto LI = dyn_cast<LoadInst>(I))
28804 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28805 LI->getAlign() >= Align(16) &&
28806 LI->getOrdering() == AtomicOrdering::Acquire;
28807
28808 if (auto SI = dyn_cast<StoreInst>(I))
28809 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28810 SI->getAlign() >= Align(16) &&
28811 SI->getOrdering() == AtomicOrdering::Release;
28812
28813 return false;
28814}
28815
28816bool AArch64TargetLowering::shouldInsertFencesForAtomic(
28817 const Instruction *I) const {
28818 if (isOpSuitableForRCPC3(I))
28819 return false;
28820 if (isOpSuitableForLSE128(I))
28821 return false;
28822 if (isOpSuitableForLDPSTP(I))
28823 return true;
28824 return false;
28825}
28826
28827bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
28828 const Instruction *I) const {
28829 // Store-Release instructions only provide seq_cst guarantees when paired with
28830 // Load-Acquire instructions. MSVC CRT does not use these instructions to
28831 // implement seq_cst loads and stores, so we need additional explicit fences
28832 // after memory writes.
28833 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28834 return false;
28835
28836 switch (I->getOpcode()) {
28837 default:
28838 return false;
28839 case Instruction::AtomicCmpXchg:
28840 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
28841 AtomicOrdering::SequentiallyConsistent;
28842 case Instruction::AtomicRMW:
28843 return cast<AtomicRMWInst>(I)->getOrdering() ==
28844 AtomicOrdering::SequentiallyConsistent;
28845 case Instruction::Store:
28846 return cast<StoreInst>(I)->getOrdering() ==
28847 AtomicOrdering::SequentiallyConsistent;
28848 }
28849}
28850
28851// Loads and stores less than 128 bits are already atomic; ones above that
28852// are doomed anyway, so defer to the default libcall and blame the OS when
28853// things go wrong.
28854TargetLowering::AtomicExpansionKind
28855AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
28856 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
28857 if (Size != 128)
28866}
28867
28868// Loads and stores less than 128 bits are already atomic; ones above that
28869// are doomed anyway, so defer to the default libcall and blame the OS when
28870// things go wrong.
28871TargetLowering::AtomicExpansionKind
28872AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
28873 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
28874
28875 if (Size != 128)
28876 return AtomicExpansionKind::None;
28877 if (isOpSuitableForRCPC3(LI))
28878 return AtomicExpansionKind::None;
28879 // No LSE128 loads
28880 if (isOpSuitableForLDPSTP(LI))
28881 return AtomicExpansionKind::None;
28882
28883 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28884 // implement atomicrmw without spilling. If the target address is also on the
28885 // stack and close enough to the spill slot, this can lead to a situation
28886 // where the monitor always gets cleared and the atomic operation can never
28887 // succeed. So at -O0 lower this operation to a CAS loop.
28888 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28889 return AtomicExpansionKind::CmpXChg;
28890
28891 // Using CAS for an atomic load has a better chance of succeeding under high
28892 // contention situations. So use it if available.
28893 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28894 : AtomicExpansionKind::LLSC;
28895}
28896
28897// Return true if the atomic operation expansion will lower to use a library
28898// call, and is thus ineligible to use an LLSC expansion.
28899static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28900 const AtomicRMWInst *RMW) {
28901 if (!RMW->isFloatingPointOperation())
28902 return false;
28903 switch (RMW->getType()->getScalarType()->getTypeID()) {
28904 case Type::FloatTyID:
28905 case Type::DoubleTyID:
28906 case Type::HalfTyID:
28907 case Type::BFloatTyID:
28908 // Will use soft float
28909 return !Subtarget.hasFPARMv8();
28910 default:
28911 // fp128 will emit library calls.
28912 return true;
28913 }
28914
28915 llvm_unreachable("covered type switch");
28916}
28917
28918// The "default" for integer RMW operations is to expand to an LL/SC loop.
28919// However, with the LSE instructions (or outline-atomics mode, which provides
28920// library routines in place of the LSE-instructions), we can directly emit many
28921// operations instead.
28922TargetLowering::AtomicExpansionKind
28923AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
28924 Type *Ty = AI->getType();
28925 unsigned Size = Ty->getPrimitiveSizeInBits();
28926 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
28927
28928 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
28932 if (CanUseLSE128)
28934
28935 // If LSFE is available, use atomic FP instructions in preference to expansion.
28936 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
28942
28943 // Nand is not supported in LSE.
28944 // Leave 128 bits to LLSC or CmpXChg.
28945 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
28946 !AI->isFloatingPointOperation()) {
28947 if (Subtarget->hasLSE())
28948 return AtomicExpansionKind::None;
28949 if (Subtarget->outlineAtomics()) {
28950 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
28951 // Don't outline them unless
28952 // (1) high level <atomic> support approved:
28953 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
28954 // (2) low level libgcc and compiler-rt support implemented by:
28955 // min/max outline atomics helpers
28956 if (AI->getOperation() != AtomicRMWInst::Min &&
28957 AI->getOperation() != AtomicRMWInst::Max &&
28958 AI->getOperation() != AtomicRMWInst::UMin &&
28959 AI->getOperation() != AtomicRMWInst::UMax) {
28960 return AtomicExpansionKind::None;
28961 }
28962 }
28963 }
28964
28965 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28966 // implement atomicrmw without spilling. If the target address is also on the
28967 // stack and close enough to the spill slot, this can lead to a situation
28968 // where the monitor always gets cleared and the atomic operation can never
28969 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
28970 // we have a single CAS instruction that can replace the loop.
28971 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
28972 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
28973 return AtomicExpansionKind::CmpXChg;
28974
28975 return AtomicExpansionKind::LLSC;
28976}
28977
28978TargetLowering::AtomicExpansionKind
28979AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
28980 AtomicCmpXchgInst *AI) const {
28981 // If subtarget has LSE, leave cmpxchg intact for codegen.
28982 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
28983 return AtomicExpansionKind::None;
28984 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28985 // implement cmpxchg without spilling. If the address being exchanged is also
28986 // on the stack and close enough to the spill slot, this can lead to a
28987 // situation where the monitor always gets cleared and the atomic operation
28988 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
28989 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28990 return AtomicExpansionKind::None;
28991
28992 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
28993 // it.
28994 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
28995 if (Size > 64)
28996 return AtomicExpansionKind::None;
28997
28998 return AtomicExpansionKind::LLSC;
28999}
29000
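// Emit the load-exclusive for an LL/SC expansion: LDXP/LDAXP for 128-bit
// values (recombined from the {i64, i64} result), otherwise LDXR/LDAXR with
// the accessed width conveyed via an ElementType parameter attribute.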
29001Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
29002 Type *ValueTy, Value *Addr,
29003 AtomicOrdering Ord) const {
29004 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29005 bool IsAcquire = isAcquireOrStronger(Ord);
29006
29007 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
29008 // intrinsic must return {i64, i64} and we have to recombine them into a
29009 // single i128 here.
29010 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29011 Intrinsic::ID Int =
29012 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29013
29014 Value *LoHi =
29015 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29016
29017 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29018 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29019
29020 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29021 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29022 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29023
29024 Value *Or = Builder.CreateOr(
29025 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
29026 return Builder.CreateBitCast(Or, ValueTy);
29027 }
29028
29029 Type *Tys[] = { Addr->getType() };
29030 Intrinsic::ID Int =
29031 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
29032
29033 const DataLayout &DL = M->getDataLayout();
29034 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
29035 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
29036 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
29037 Attribute::ElementType, IntEltTy));
29038 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
29039
29040 return Builder.CreateBitCast(Trunc, ValueTy);
29041}
29042
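// Clear the exclusive monitor (CLREX) when a cmpxchg's LL/SC sequence is
// abandoned without executing the store-exclusive.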
29043void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
29044 IRBuilderBase &Builder) const {
29045 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
29046}
29047
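// Emit the matching store-exclusive: STXP/STLXP for 128-bit values (split into
// an i64 pair), otherwise STXR/STLXR on an integer of the value's width.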
29048Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
29049 Value *Val, Value *Addr,
29050 AtomicOrdering Ord) const {
29051 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29052 bool IsRelease = isReleaseOrStronger(Ord);
29053
29054 // Since the intrinsics must have legal type, the i128 intrinsics take two
29055 // parameters: "i64, i64". We must marshal Val into the appropriate form
29056 // before the call.
29057 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
29058 Intrinsic::ID Int =
29059 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
29061 Type *Int64Ty = Type::getInt64Ty(M->getContext());
29062 Type *Int128Ty = Type::getInt128Ty(M->getContext());
29063
29064 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
29065
29066 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
29067 Value *Hi =
29068 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
29069 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
29070 }
29071
29072 Intrinsic::ID Int =
29073 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
29074 Type *Tys[] = { Addr->getType() };
29076
29077 const DataLayout &DL = M->getDataLayout();
29078 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
29079 Val = Builder.CreateBitCast(Val, IntValTy);
29080
29081 CallInst *CI = Builder.CreateCall(
29082 Stxr, {Builder.CreateZExtOrBitCast(
29083 Val, Stxr->getFunctionType()->getParamType(0)),
29084 Addr});
29085 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
29086 Attribute::ElementType, Val->getType()));
29087 return CI;
29088}
29089
29091 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
29092 const DataLayout &DL) const {
29093 if (!Ty->isArrayTy()) {
29094 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
29095 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
29096 }
29097
29098 // All non aggregate members of the type must have the same type
29099 SmallVector<EVT> ValueVTs;
29100 ComputeValueVTs(*this, DL, Ty, ValueVTs);
29101 return all_equal(ValueVTs);
29102}
29103
29104bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
29105 EVT) const {
29106 return false;
29107}
29108
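// Build a pointer at a fixed byte offset from the thread pointer; used for the
// Android and Fuchsia TLS slots that hold the stack cookie and the unsafe
// stack pointer.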
29109static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
29110 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
29111 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
29112 M, Intrinsic::thread_pointer, IRB.getPtrTy());
29113 return IRB.CreatePointerCast(
29114 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
29115 Offset),
29116 IRB.getPtrTy(0));
29117}
29118
29119Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
29120 // Android provides a fixed TLS slot for the stack cookie. See the definition
29121 // of TLS_SLOT_STACK_GUARD in
29122 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
29123 if (Subtarget->isTargetAndroid())
29124 return UseTlsOffset(IRB, 0x28);
29125
29126 // Fuchsia is similar.
29127 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
29128 if (Subtarget->isTargetFuchsia())
29129 return UseTlsOffset(IRB, -0x10);
29130
29131 return TargetLowering::getIRStackGuard(IRB);
29132}
29133
29134void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
29135 // MSVC CRT provides functionalities for stack protection.
29136 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29137 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29138
29139 RTLIB::LibcallImpl SecurityCookieVar =
29140 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
29141 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
29142 SecurityCookieVar != RTLIB::Unsupported) {
29143 // MSVC CRT has a global variable holding security cookie.
29144 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
29145 PointerType::getUnqual(M.getContext()));
29146
29147 // MSVC CRT has a function to validate security cookie.
29148 FunctionCallee SecurityCheckCookie =
29149 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
29150 Type::getVoidTy(M.getContext()),
29151 PointerType::getUnqual(M.getContext()));
29152 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
29153 F->setCallingConv(CallingConv::Win64);
29154 F->addParamAttr(0, Attribute::AttrKind::InReg);
29155 }
29156 return;
29157 }
29158 TargetLowering::insertSSPDeclarations(M);
29159}
29160
29161Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
29162 // MSVC CRT has a function to validate security cookie.
29163 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29164 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29165 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
29166 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
29167 return TargetLowering::getSSPStackGuardCheck(M);
29168}
29169
29170Value *
29171AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
29172 // Android provides a fixed TLS slot for the SafeStack pointer. See the
29173 // definition of TLS_SLOT_SAFESTACK in
29174 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
29175 if (Subtarget->isTargetAndroid())
29176 return UseTlsOffset(IRB, 0x48);
29177
29178 // Fuchsia is similar.
29179 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
29180 if (Subtarget->isTargetFuchsia())
29181 return UseTlsOffset(IRB, -0x8);
29182
29183 return TargetLowering::getSafeStackPointerLocation(IRB);
29184}
29185
29186/// If a physical register, this returns the register that receives the
29187/// exception address on entry to an EH pad.
29189 const Constant *PersonalityFn) const {
29190 // FIXME: This is a guess. Has this been defined yet?
29191 return AArch64::X0;
29192}
29193
29194/// If a physical register, this returns the register that receives the
29195/// exception typeid on entry to a landing pad.
29197 const Constant *PersonalityFn) const {
29198 // FIXME: This is a guess. Has this been defined yet?
29199 return AArch64::X1;
29200}
29201
29203 const Instruction &AndI) const {
29204 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
29205 // this is likely to fold the and/cmp/br into a single tbz instruction. It
29206 // may be beneficial to sink in other cases, but we would have to check that
29207 // the cmp would not get folded into the br to form a cbz for these to be
29208 // beneficial.
29209 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
29210 if (!Mask)
29211 return false;
29212 return Mask->getValue().isPowerOf2();
29213}
29214
29218 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
29219 SelectionDAG &DAG) const {
29220 // Does baseline recommend not to perform the fold by default?
29222 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
29223 return false;
29224 // Else, if this is a vector shift, prefer 'shl'.
29225 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
29226}
29227
29230 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
29232 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
29235 ExpansionFactor);
29236}
29237
29238void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
29239 // Update IsSplitCSR in AArch64FunctionInfo.
29240 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
29241 AFI->setIsSplitCSR(true);
29242}
29243
29245 MachineBasicBlock *Entry,
29246 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
29247 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
29248 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
29249 if (!IStart)
29250 return;
29251
29252 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
29253 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
29254 MachineBasicBlock::iterator MBBI = Entry->begin();
29255 for (const MCPhysReg *I = IStart; *I; ++I) {
29256 const TargetRegisterClass *RC = nullptr;
29257 if (AArch64::GPR64RegClass.contains(*I))
29258 RC = &AArch64::GPR64RegClass;
29259 else if (AArch64::FPR64RegClass.contains(*I))
29260 RC = &AArch64::FPR64RegClass;
29261 else
29262 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
29263
29264 Register NewVR = MRI->createVirtualRegister(RC);
29265 // Create copy from CSR to a virtual register.
29266 // FIXME: this currently does not emit CFI pseudo-instructions, it works
29267 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
29268 // nounwind. If we want to generalize this later, we may need to emit
29269 // CFI pseudo-instructions.
29270 assert(Entry->getParent()->getFunction().hasFnAttribute(
29271 Attribute::NoUnwind) &&
29272 "Function should be nounwind in insertCopiesSplitCSR!");
29273 Entry->addLiveIn(*I);
29274 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
29275 .addReg(*I);
29276
29277 // Insert the copy-back instructions right before the terminator.
29278 for (auto *Exit : Exits)
29279 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
29280 TII->get(TargetOpcode::COPY), *I)
29281 .addReg(NewVR);
29282 }
29283}
29284
29285bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
29286 // Integer division on AArch64 is expensive. However, when aggressively
29287 // optimizing for code size, we prefer to use a div instruction, as it is
29288 // usually smaller than the alternative sequence.
29289 // The exception to this is vector division. Since AArch64 doesn't have vector
29290 // integer division, leaving the division as-is is a loss even in terms of
29291 // size, because it will have to be scalarized, while the alternative code
29292 // sequence can be performed in vector form.
29293 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
29294 return OptSize && !VT.isVector();
29295}
29296
29298 const MachineFunction &MF) const {
29299 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
29300 // In future, we could allow this when SVE is available, but currently,
29301 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
29302 // the general lowering may introduce stack spills/reloads).
29303 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
29304 return false;
29305
29306 // Do not merge to float value size (128 bits) if no implicit float attribute
29307 // is set.
29308 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
29309 return !NoFloat || MemVT.getSizeInBits() <= 64;
29310}
29311
29313 // We want inc-of-add for scalars and sub-of-not for vectors.
29314 return VT.isScalarInteger();
29315}
29316
29318 EVT VT) const {
29319 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
29320 // legalize.
29321 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
29322 return false;
29323 if (FPVT == MVT::v8bf16)
29324 return false;
29325 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
29326}
29327
29329 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
29330 // avoid vselect becoming bsl / unrolling.
29331 return !VT.isFixedLengthVector();
29332}
29333
29337 const TargetInstrInfo *TII) const {
29338 assert(MBBI->isCall() && MBBI->getCFIType() &&
29339 "Invalid call instruction for a KCFI check");
29340
29341 switch (MBBI->getOpcode()) {
29342 case AArch64::BLR:
29343 case AArch64::BLRNoIP:
29344 case AArch64::TCRETURNri:
29345 case AArch64::TCRETURNrix16x17:
29346 case AArch64::TCRETURNrix17:
29347 case AArch64::TCRETURNrinotx16:
29348 break;
29349 default:
29350 llvm_unreachable("Unexpected CFI call opcode");
29351 }
29352
29353 MachineOperand &Target = MBBI->getOperand(0);
29354 assert(Target.isReg() && "Invalid target operand for an indirect call");
29355 Target.setIsRenamable(false);
29356
29357 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
29358 .addReg(Target.getReg())
29359 .addImm(MBBI->getCFIType())
29360 .getInstr();
29361}
29362
29364 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
29365}
29366
29367unsigned
29369 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
29370 return getPointerTy(DL).getSizeInBits();
29371
29372 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
29373}
29374
29375void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
29376 MachineFrameInfo &MFI = MF.getFrameInfo();
29377 // If we have any vulnerable SVE stack objects then the stack protector
29378 // needs to be placed at the top of the SVE stack area, as the SVE locals
29379 // are placed above the other locals, so we allocate it as if it were a
29380 // scalable vector.
29381 // FIXME: It may be worthwhile having a specific interface for this rather
29382 // than doing it here in finalizeLowering.
29383 if (MFI.hasStackProtectorIndex()) {
29384 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
29390 break;
29391 }
29392 }
29393 }
29394 MFI.computeMaxCallFrameSize(MF);
29395 TargetLoweringBase::finalizeLowering(MF);
29396}
29397
29398// Unlike X86, we let frame lowering assign offsets to all catch objects.
29399bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
29400
29401bool AArch64TargetLowering::shouldLocalize(
29402 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
29403 auto &MF = *MI.getMF();
29404 auto &MRI = MF.getRegInfo();
29405 auto maxUses = [](unsigned RematCost) {
29406 // A cost of 1 means remats are basically free.
29407 if (RematCost == 1)
29408 return std::numeric_limits<unsigned>::max();
29409 if (RematCost == 2)
29410 return 2U;
29411
29412 // Remat is too expensive, only sink if there's one user.
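    // (e.g. an immediate that needs a MOVZ plus two or more MOVKs to
    // materialize)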
29413 if (RematCost > 2)
29414 return 1U;
29415 llvm_unreachable("Unexpected remat cost");
29416 };
29417
29418 unsigned Opc = MI.getOpcode();
29419 switch (Opc) {
29420 case TargetOpcode::G_GLOBAL_VALUE: {
29421 // On Darwin, TLS global vars get selected into function calls, which
29422 // we don't want localized, as they can get moved into the middle of
29423 // another call sequence.
29424 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29425 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29426 return false;
29427 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29428 }
29429 case TargetOpcode::G_FCONSTANT:
29430 case TargetOpcode::G_CONSTANT: {
29431 const ConstantInt *CI;
29432 unsigned AdditionalCost = 0;
29433
29434 if (Opc == TargetOpcode::G_CONSTANT)
29435 CI = MI.getOperand(1).getCImm();
29436 else {
29437 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29438 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29439 // materialized as integers.
29440 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29441 break;
29442 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29443 bool OptForSize = MF.getFunction().hasOptSize();
29444 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
29445 OptForSize))
29446 return true; // Constant should be cheap.
29447 CI =
29448 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29449 // FP materialization also costs an extra move, from gpr to fpr.
29450 AdditionalCost = 1;
29451 }
29452 APInt Imm = CI->getValue();
29453 InstructionCost Cost = TTI->getIntImmCost(
29454 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
29455 assert(Cost.isValid() && "Expected a valid imm cost");
29456
29457 unsigned RematCost = Cost.getValue();
29458 RematCost += AdditionalCost;
29459 Register Reg = MI.getOperand(0).getReg();
29460 unsigned MaxUses = maxUses(RematCost);
29461 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29462 if (MaxUses == std::numeric_limits<unsigned>::max())
29463 --MaxUses;
29464 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29465 }
29466 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29467 // localizable.
29468 case AArch64::ADRP:
29469 case AArch64::G_ADD_LOW:
29470 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29471 case TargetOpcode::G_PTR_ADD:
29472 return true;
29473 default:
29474 break;
29475 }
29476 return TargetLoweringBase::shouldLocalize(MI, TTI);
29477}
29478
29479bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
29480 // Fall back for scalable vectors.
29481 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29482 // all instructions, regardless of whether they are actually supported.
29483 if (!EnableSVEGISel) {
29484 if (Inst.getType()->isScalableTy()) {
29485 return true;
29486 }
29487
29488 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29489 if (Inst.getOperand(i)->getType()->isScalableTy())
29490 return true;
29491
29492 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29493 if (AI->getAllocatedType()->isScalableTy())
29494 return true;
29495 }
29496 }
29497
29498 // Fall back for calls that need SME state handling (streaming-mode changes,
29498 // lazy ZA save, or ZT0/ZA preservation).
29499 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29500 auto CallAttrs = SMECallAttrs(*Base, this);
29501 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29502 CallAttrs.requiresPreservingZT0() ||
29503 CallAttrs.requiresPreservingAllZAState())
29504 return true;
29505 }
29506 return false;
29507}
29508
29509// Return the largest legal scalable vector type that matches VT's element type.
29513 "Expected legal fixed length vector!");
29514 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29515 default:
29516 llvm_unreachable("unexpected element type for SVE container");
29517 case MVT::i8:
29518 return EVT(MVT::nxv16i8);
29519 case MVT::i16:
29520 return EVT(MVT::nxv8i16);
29521 case MVT::i32:
29522 return EVT(MVT::nxv4i32);
29523 case MVT::i64:
29524 return EVT(MVT::nxv2i64);
29525 case MVT::bf16:
29526 return EVT(MVT::nxv8bf16);
29527 case MVT::f16:
29528 return EVT(MVT::nxv8f16);
29529 case MVT::f32:
29530 return EVT(MVT::nxv4f32);
29531 case MVT::f64:
29532 return EVT(MVT::nxv2f64);
29533 }
29534}
29535
29536// Return a predicate with active lanes corresponding to the extent of VT.
29537static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
29538 EVT VT) {
29541 "Expected legal fixed length vector!");
29542
29543 std::optional<unsigned> PgPattern =
29544 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
29545 assert(PgPattern && "Unexpected element count for SVE predicate");
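  // e.g. a fixed-length vector with 4 elements yields the VL4 predicate
  // pattern.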
29546
29547 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29548 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29549 // variants of instructions when available.
29550 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29551 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29552 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29553 if (MaxSVESize && MinSVESize == MaxSVESize &&
29554 MaxSVESize == VT.getSizeInBits())
29555 PgPattern = AArch64SVEPredPattern::all;
29556
29557 MVT MaskVT;
29558 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29559 default:
29560 llvm_unreachable("unexpected element type for SVE predicate");
29561 case MVT::i8:
29562 MaskVT = MVT::nxv16i1;
29563 break;
29564 case MVT::i16:
29565 case MVT::f16:
29566 case MVT::bf16:
29567 MaskVT = MVT::nxv8i1;
29568 break;
29569 case MVT::i32:
29570 case MVT::f32:
29571 MaskVT = MVT::nxv4i1;
29572 break;
29573 case MVT::i64:
29574 case MVT::f64:
29575 MaskVT = MVT::nxv2i1;
29576 break;
29577 }
29578
29579 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29580}
29581
29582static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
29583 EVT VT) {
29585 "Expected legal scalable vector!");
29586 auto PredTy = VT.changeVectorElementType(MVT::i1);
29587 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29588}
29589
29590static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
29591 if (VT.isFixedLengthVector())
29592 return getPredicateForFixedLengthVector(DAG, DL, VT);
29593
29594 return getPredicateForScalableVector(DAG, DL, VT);
29595}
29596
29597// Grow V to consume an entire SVE register.
29598static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29599 assert(VT.isScalableVector() &&
29600 "Expected to convert into a scalable vector!");
29601 assert(V.getValueType().isFixedLengthVector() &&
29602 "Expected a fixed length vector operand!");
29603 SDLoc DL(V);
29604 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
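  // The fixed-length value is placed in the low lanes of the scalable
  // container; the remaining lanes are undef.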
29605 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29606}
29607
29608// Shrink V so it's just big enough to maintain a VT's worth of data.
29611 "Expected to convert into a fixed length vector!");
29612 assert(V.getValueType().isScalableVector() &&
29613 "Expected a scalable vector operand!");
29614 SDLoc DL(V);
29615 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29616 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29617}
29618
29619// Convert all fixed length vector loads larger than NEON to masked_loads.
29620SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29621 SDValue Op, SelectionDAG &DAG) const {
29622 auto Load = cast<LoadSDNode>(Op);
29623
29624 SDLoc DL(Op);
29625 EVT VT = Op.getValueType();
29626 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29627 EVT LoadVT = ContainerVT;
29628 EVT MemVT = Load->getMemoryVT();
29629
29630 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29631
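  // Floating-point loads are performed as integer masked loads of the same
  // width and converted back to the FP type afterwards (via a bitcast, or a
  // predicated extend for extending loads).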
29632 if (VT.isFloatingPoint()) {
29633 LoadVT = ContainerVT.changeTypeToInteger();
29634 MemVT = MemVT.changeTypeToInteger();
29635 }
29636
29637 SDValue NewLoad = DAG.getMaskedLoad(
29638 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29639 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29640 Load->getAddressingMode(), Load->getExtensionType());
29641
29642 SDValue Result = NewLoad;
29643 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29644 EVT ExtendVT = ContainerVT.changeVectorElementType(
29645 Load->getMemoryVT().getVectorElementType());
29646
29647 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29648 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29649 Pg, Result, DAG.getUNDEF(ContainerVT));
29650 } else if (VT.isFloatingPoint()) {
29651 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29652 }
29653
29654 Result = convertFromScalableVector(DAG, VT, Result);
29655 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29656 return DAG.getMergeValues(MergedValues, DL);
29657}
29658
29659static SDValue convertFixedMaskToScalableVector(SDValue Mask,
29660 SelectionDAG &DAG) {
29661 SDLoc DL(Mask);
29662 EVT InVT = Mask.getValueType();
29663 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29664 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29665
29666 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29667 return Pg;
29668
29669 bool InvertCond = false;
29670 if (isBitwiseNot(Mask)) {
29671 InvertCond = true;
29672 Mask = Mask.getOperand(0);
29673 }
29674
29675 SDValue Op1, Op2;
29676 ISD::CondCode CC;
29677
29678 // When Mask is the result of a SETCC, it's better to regenerate the compare.
29679 if (Mask.getOpcode() == ISD::SETCC) {
29680 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29681 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29682 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29683 } else {
29684 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29685 Op2 = DAG.getConstant(0, DL, ContainerVT);
29686 CC = ISD::SETNE;
29687 }
29688
29689 if (InvertCond)
29690 CC = getSetCCInverse(CC, Op1.getValueType());
29691
29692 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29693 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29694}
29695
29696// Convert all fixed length vector loads larger than NEON to masked_loads.
29697SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29698 SDValue Op, SelectionDAG &DAG) const {
29699 auto Load = cast<MaskedLoadSDNode>(Op);
29700
29701 SDLoc DL(Op);
29702 EVT VT = Op.getValueType();
29703 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29704
29705 SDValue Mask = Load->getMask();
29706 // If this is an extending load and the mask type is not the same as the
29707 // load's type then we have to extend the mask type.
29708 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29709 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29710 "Incorrect mask type");
29711 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29712 }
29713 Mask = convertFixedMaskToScalableVector(Mask, DAG);
29714
29715 SDValue PassThru;
29716 bool IsPassThruZeroOrUndef = false;
29717
29718 if (Load->getPassThru()->isUndef()) {
29719 PassThru = DAG.getUNDEF(ContainerVT);
29720 IsPassThruZeroOrUndef = true;
29721 } else {
29722 if (ContainerVT.isInteger())
29723 PassThru = DAG.getConstant(0, DL, ContainerVT);
29724 else
29725 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29726 if (isZerosVector(Load->getPassThru().getNode()))
29727 IsPassThruZeroOrUndef = true;
29728 }
29729
29730 SDValue NewLoad = DAG.getMaskedLoad(
29731 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29732 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29733 Load->getAddressingMode(), Load->getExtensionType());
29734
29735 SDValue Result = NewLoad;
29736 if (!IsPassThruZeroOrUndef) {
29737 SDValue OldPassThru =
29738 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29739 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29740 }
29741
29742 Result = convertFromScalableVector(DAG, VT, Result);
29743 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29744 return DAG.getMergeValues(MergedValues, DL);
29745}
29746
29747// Convert all fixed length vector stores larger than NEON to masked_stores.
29748SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29749 SDValue Op, SelectionDAG &DAG) const {
29750 auto Store = cast<StoreSDNode>(Op);
29751
29752 SDLoc DL(Op);
29753 EVT VT = Store->getValue().getValueType();
29754 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29755 EVT MemVT = Store->getMemoryVT();
29756
29757 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29758 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29759
29760 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29761 EVT TruncVT = ContainerVT.changeVectorElementType(
29762 Store->getMemoryVT().getVectorElementType());
29763 MemVT = MemVT.changeTypeToInteger();
29764 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29765 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29766 DAG.getUNDEF(TruncVT));
29767 NewValue =
29768 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29769 } else if (VT.isFloatingPoint()) {
29770 MemVT = MemVT.changeTypeToInteger();
29771 NewValue =
29772 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29773 }
29774
29775 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
29776 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
29777 Store->getMemOperand(), Store->getAddressingMode(),
29778 Store->isTruncatingStore());
29779}
29780
29781SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
29782 SDValue Op, SelectionDAG &DAG) const {
29783 auto Store = cast<MaskedStoreSDNode>(Op);
29784
29785 SDLoc DL(Op);
29786 EVT VT = Store->getValue().getValueType();
29787 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29788
29789 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29790 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
29791
29792 return DAG.getMaskedStore(
29793 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
29794 Mask, Store->getMemoryVT(), Store->getMemOperand(),
29795 Store->getAddressingMode(), Store->isTruncatingStore());
29796}
29797
29798SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
29799 SDValue Op, SelectionDAG &DAG) const {
29800 SDLoc DL(Op);
29801 EVT VT = Op.getValueType();
29802 EVT EltVT = VT.getVectorElementType();
29803
29804 bool Signed = Op.getOpcode() == ISD::SDIV;
29805 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
29806
29807 bool Negated;
29808 uint64_t SplatVal;
29809 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
29810 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29811 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29812 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
29813
29814 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29815 SDValue Res =
29816 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
29817 if (Negated)
29818 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
29819 DAG.getConstant(0, DL, ContainerVT), Res);
29820
29821 return convertFromScalableVector(DAG, VT, Res);
29822 }
29823
29824 // Scalable vector i32/i64 DIV is supported.
29825 if (EltVT == MVT::i32 || EltVT == MVT::i64)
29826 return LowerToPredicatedOp(Op, DAG, PredOpcode);
29827
29828 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
29829 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
29830 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
29831 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
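  // i8/i16 element divides are legalised by promotion: either widen the
  // element type in place when the wider vector type is legal, or split the
  // vector in half, extend, divide, truncate and concatenate. Recursion
  // eventually reaches i32, for which a predicated SVE DIV exists.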
29832
29833 // If the wider type is legal: extend, op, and truncate.
29834 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29835 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
29836 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
29837 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
29838 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
29839 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
29840 }
29841
29842 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
29843 &ExtendOpcode](SDValue Op) {
29844 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
29845 SDValue IdxHalf =
29846 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
29847 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
29848 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
29849 return std::pair<SDValue, SDValue>(
29850 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
29851 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
29852 };
29853
29854 // If wider type is not legal: split, extend, op, trunc and concat.
29855 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
29856 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
29857 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
29858 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
29859 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
29860 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
29861 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
29862}
29863
29864SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
29865 SDValue Op, SelectionDAG &DAG) const {
29866 EVT VT = Op.getValueType();
29867 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29868
29869 SDLoc DL(Op);
29870 SDValue Val = Op.getOperand(0);
29871 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29872 Val = convertToScalableVector(DAG, ContainerVT, Val);
29873
29874 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
29875 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
29876
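  // e.g. zero-extending v8i8 to v8i32 unpacks the nxv16i8 container to
  // nxv8i16 and then to nxv4i32.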
29877 // Repeatedly unpack Val until the result is of the desired element type.
29878 switch (ContainerVT.getSimpleVT().SimpleTy) {
29879 default:
29880 llvm_unreachable("unimplemented container type");
29881 case MVT::nxv16i8:
29882 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
29883 if (VT.getVectorElementType() == MVT::i16)
29884 break;
29885 [[fallthrough]];
29886 case MVT::nxv8i16:
29887 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
29888 if (VT.getVectorElementType() == MVT::i32)
29889 break;
29890 [[fallthrough]];
29891 case MVT::nxv4i32:
29892 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
29893 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29894 break;
29895 }
29896
29897 return convertFromScalableVector(DAG, VT, Val);
29898}
29899
29900SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29901 SDValue Op, SelectionDAG &DAG) const {
29902 EVT VT = Op.getValueType();
29903 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29904
29905 SDLoc DL(Op);
29906 SDValue Val = Op.getOperand(0);
29907 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29908 Val = convertToScalableVector(DAG, ContainerVT, Val);
29909
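  // e.g. truncating v4i64 to v4i16 goes nxv2i64 -> UZP1 at i32 granularity ->
  // UZP1 at i16 granularity; only the low fixed-length lanes are used.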
29910 // Repeatedly truncate Val until the result is of the desired element type.
29911 switch (ContainerVT.getSimpleVT().SimpleTy) {
29912 default:
29913 llvm_unreachable("unimplemented container type");
29914 case MVT::nxv2i64:
29915 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
29916 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
29917 if (VT.getVectorElementType() == MVT::i32)
29918 break;
29919 [[fallthrough]];
29920 case MVT::nxv4i32:
29921 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
29922 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
29923 if (VT.getVectorElementType() == MVT::i16)
29924 break;
29925 [[fallthrough]];
29926 case MVT::nxv8i16:
29927 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
29928 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
29929 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
29930 break;
29931 }
29932
29933 return convertFromScalableVector(DAG, VT, Val);
29934}
29935
29936SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
29937 SDValue Op, SelectionDAG &DAG) const {
29938 EVT VT = Op.getValueType();
29939 EVT InVT = Op.getOperand(0).getValueType();
29940 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
29941
29942 SDLoc DL(Op);
29943 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29944 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29945
29946 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
29947}
29948
29949SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
29950 SDValue Op, SelectionDAG &DAG) const {
29951 EVT VT = Op.getValueType();
29952 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29953
29954 SDLoc DL(Op);
29955 EVT InVT = Op.getOperand(0).getValueType();
29956 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29957 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29958
29959 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
29960 Op.getOperand(1), Op.getOperand(2));
29961
29962 return convertFromScalableVector(DAG, VT, ScalableRes);
29963}
29964
29965// Convert vector operation 'Op' to an equivalent predicated operation whereby
29966// the original operation's type is used to construct a suitable predicate.
29967// NOTE: The results for inactive lanes are undefined.
29968SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
29969 SelectionDAG &DAG,
29970 unsigned NewOp) const {
29971 EVT VT = Op.getValueType();
29972 SDLoc DL(Op);
29973 auto Pg = getPredicateForVector(DAG, DL, VT);
29974
29975 if (VT.isFixedLengthVector()) {
29976 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
29977 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29978
29979 // Create list of operands by converting existing ones to scalable types.
29980 SmallVector<SDValue, 4> Operands = {Pg};
29981 for (const SDValue &V : Op->op_values()) {
29982 if (isa<CondCodeSDNode>(V)) {
29983 Operands.push_back(V);
29984 continue;
29985 }
29986
29987 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
29988 EVT VTArg = VTNode->getVT().getVectorElementType();
29989 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
29990 Operands.push_back(DAG.getValueType(NewVTArg));
29991 continue;
29992 }
29993
29994 assert(isTypeLegal(V.getValueType()) &&
29995 "Expected only legal fixed-width types");
29996 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
29997 }
29998
29999 if (isMergePassthruOpcode(NewOp))
30000 Operands.push_back(DAG.getUNDEF(ContainerVT));
30001
30002 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30003 return convertFromScalableVector(DAG, VT, ScalableRes);
30004 }
30005
30006 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
30007
30008 SmallVector<SDValue, 4> Operands;
30009 for (const SDValue &V : Op->op_values()) {
30010 assert((!V.getValueType().isVector() ||
30011 V.getValueType().isScalableVector()) &&
30012 "Only scalable vectors are supported!");
30013 Operands.push_back(V);
30014 }
30015
30016 if (isMergePassthruOpcode(NewOp))
30017 Operands.push_back(DAG.getUNDEF(VT));
30018
30019 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
30020}
30021
30022// If a fixed length vector operation has no side effects when applied to
30023// undefined elements, we can safely use scalable vectors to perform the same
30024// operation without needing to worry about predication.
30025SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
30026 SelectionDAG &DAG) const {
30027 EVT VT = Op.getValueType();
30029 "Only expected to lower fixed length vector operation!");
30030 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30031
30032 // Create list of operands by converting existing ones to scalable types.
30033 SmallVector<SDValue, 4> Ops;
30034 for (const SDValue &V : Op->op_values()) {
30035 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
30036
30037 // Pass through non-vector operands.
30038 if (!V.getValueType().isVector()) {
30039 Ops.push_back(V);
30040 continue;
30041 }
30042
30043 // "cast" fixed length vector to a scalable vector.
30044 assert(V.getValueType().isFixedLengthVector() &&
30045 isTypeLegal(V.getValueType()) &&
30046 "Only fixed length vectors are supported!");
30047 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
30048 }
30049
30050 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
30051 return convertFromScalableVector(DAG, VT, ScalableRes);
30052}
30053
30054SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
30055 SelectionDAG &DAG) const {
30056 SDLoc DL(ScalarOp);
30057 SDValue AccOp = ScalarOp.getOperand(0);
30058 SDValue VecOp = ScalarOp.getOperand(1);
30059 EVT SrcVT = VecOp.getValueType();
30060 EVT ResVT = SrcVT.getVectorElementType();
30061
30062 EVT ContainerVT = SrcVT;
30063 if (SrcVT.isFixedLengthVector()) {
30064 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30065 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30066 }
30067
30068 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30069 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30070
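  // FADDA performs a strictly-ordered floating-point add reduction, starting
  // from the scalar placed in lane 0 of the accumulator vector, which matches
  // the sequential semantics of VECREDUCE_SEQ_FADD.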
30071 // Convert operands to Scalable.
30072 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
30073 DAG.getUNDEF(ContainerVT), AccOp, Zero);
30074
30075 // Perform reduction.
30076 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
30077 Pg, AccOp, VecOp);
30078
30079 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
30080}
30081
30082SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
30083 SelectionDAG &DAG) const {
30084 SDLoc DL(ReduceOp);
30085 SDValue Op = ReduceOp.getOperand(0);
30086 EVT OpVT = Op.getValueType();
30087 EVT VT = ReduceOp.getValueType();
30088
30089 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
30090 return SDValue();
30091
30092 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
30093
30094 switch (ReduceOp.getOpcode()) {
30095 default:
30096 return SDValue();
30097 case ISD::VECREDUCE_OR:
30098 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
30099 // The predicate can be 'Op' because
30100 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
30101 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
30102 else
30103 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
30104 case ISD::VECREDUCE_AND: {
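    // vecreduce_and(Op) is true iff every governed lane of Op is set, i.e. no
    // lane of (Op ^ Pg) is active under Pg.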
30105 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
30106 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
30107 }
30108 case ISD::VECREDUCE_XOR: {
30109 SDValue ID =
30110 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
30111 if (OpVT == MVT::nxv1i1) {
30112 // Emulate a CNTP on .Q using .D and a different governing predicate.
30113 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
30114 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
30115 }
30116 SDValue Cntp =
30117 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
30118 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
30119 }
30120 }
30121
30122 return SDValue();
30123}
30124
30125SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
30126 SDValue ScalarOp,
30127 SelectionDAG &DAG) const {
30128 SDLoc DL(ScalarOp);
30129 SDValue VecOp = ScalarOp.getOperand(0);
30130 EVT SrcVT = VecOp.getValueType();
30131
30132 if (useSVEForFixedLengthVectorVT(
30133 SrcVT,
30134 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
30135 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30136 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30137 }
30138
30139 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
30140 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
30141 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
30142 SDValue BoolVec = VecOp.getOperand(0);
30143 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
30144 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
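      // Counting the active lanes of BoolVec is equivalent to summing its
      // zero-extended elements, so the whole reduction becomes a single CNTP.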
30145 SDValue CntpOp = DAG.getNode(
30146 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
30147 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
30148 BoolVec, BoolVec);
30149 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
30150 }
30151 }
30152
30153 // UADDV always returns an i64 result.
30154 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
30155 SrcVT.getVectorElementType();
30156 EVT RdxVT = SrcVT;
30157 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
30158 RdxVT = getPackedSVEVectorVT(ResVT);
30159
30160 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30161 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
30162 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
30163 Rdx, DAG.getConstant(0, DL, MVT::i64));
30164
30165 // The VEC_REDUCE nodes expect an element size result.
30166 if (ResVT != ScalarOp.getValueType())
30167 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
30168
30169 return Res;
30170}
30171
30172SDValue
30173AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
30174 SelectionDAG &DAG) const {
30175 EVT VT = Op.getValueType();
30176 SDLoc DL(Op);
30177
30178 EVT InVT = Op.getOperand(1).getValueType();
30179 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30180 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
30181 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
30182
30183 // Convert the mask to a predicate (NOTE: We don't need to worry about
30184 // inactive lanes since VSELECT is safe when given undefined elements).
30185 EVT MaskVT = Op.getOperand(0).getValueType();
30186 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
30187 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
30188 Mask = DAG.getNode(ISD::TRUNCATE, DL,
30189 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
30190
30191 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
30192 Mask, Op1, Op2);
30193
30194 return convertFromScalableVector(DAG, VT, ScalableRes);
30195}
30196
30197SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
30198 SDValue Op, SelectionDAG &DAG) const {
30199 SDLoc DL(Op);
30200 EVT InVT = Op.getOperand(0).getValueType();
30201 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30202
30203 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
30204 "Only expected to lower fixed length vector operation!");
30205 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
30206 "Expected integer result of the same bit length as the inputs!");
30207
30208 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30209 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
30210 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
30211
30212 EVT CmpVT = Pg.getValueType();
30213 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
30214 {Pg, Op1, Op2, Op.getOperand(2)});
30215
30216 EVT PromoteVT = ContainerVT.changeTypeToInteger();
30217 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
30218 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
30219}
30220
30221SDValue
30222AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
30223 SelectionDAG &DAG) const {
30224 SDLoc DL(Op);
30225 auto SrcOp = Op.getOperand(0);
30226 EVT VT = Op.getValueType();
30227 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30228 EVT ContainerSrcVT =
30229 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
30230
30231 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
30232 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
30233 return convertFromScalableVector(DAG, VT, Op);
30234}
30235
30236SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
30237 SDValue Op, SelectionDAG &DAG) const {
30238 SDLoc DL(Op);
30239 unsigned NumOperands = Op->getNumOperands();
30240
30241 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
30242 "Unexpected number of operands in CONCAT_VECTORS");
30243
30244 auto SrcOp1 = Op.getOperand(0);
30245 auto SrcOp2 = Op.getOperand(1);
30246 EVT VT = Op.getValueType();
30247 EVT SrcVT = SrcOp1.getValueType();
30248
30249 // Match a splat of 128b segments that fit in a single register.
30250 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
30251 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30252 SDValue Splat =
30253 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
30254 convertToScalableVector(DAG, ContainerVT, SrcOp1),
30255 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
30256 return convertFromScalableVector(DAG, VT, Splat);
30257 }
30258
30259 if (NumOperands > 2) {
30260 SmallVector<SDValue, 4> Ops;
30261 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
30262 for (unsigned I = 0; I < NumOperands; I += 2)
30263 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
30264 Op->getOperand(I), Op->getOperand(I + 1)));
30265
30266 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
30267 }
30268
30269 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30270
30271 auto Pg = getPredicateForVector(DAG, DL, VT);
30272 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
30273 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
30274
30275 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
30276
30277 return convertFromScalableVector(DAG, VT, Op);
30278}
30279
30280SDValue
30281AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
30282 SelectionDAG &DAG) const {
30283 EVT VT = Op.getValueType();
30284 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30285
30286 SDLoc DL(Op);
30287 SDValue Val = Op.getOperand(0);
30288 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30289 EVT SrcVT = Val.getValueType();
30290 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30291 EVT ExtendVT = ContainerVT.changeVectorElementType(
30292 SrcVT.getVectorElementType());
30293
30294 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30295 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
30296
30297 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
30298 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
30299 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30300 Pg, Val, DAG.getUNDEF(ContainerVT));
30301
30302 return convertFromScalableVector(DAG, VT, Val);
30303}
30304
30305SDValue
30306AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
30307 SelectionDAG &DAG) const {
30308 EVT VT = Op.getValueType();
30309 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30310
30311 SDLoc DL(Op);
30312 SDValue Val = Op.getOperand(0);
30313 EVT SrcVT = Val.getValueType();
30314 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30315 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
30316 VT.getVectorElementType());
30317 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
30318
30319 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30320 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
30321 Op.getOperand(1), DAG.getUNDEF(RoundVT));
30322 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
30323 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30324
30325 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30326 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30327}
30328
30329SDValue
30330AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
30331 SelectionDAG &DAG) const {
30332 EVT VT = Op.getValueType();
30333 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30334
30335 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
30336 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
30337 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
30338
30339 SDLoc DL(Op);
30340 SDValue Val = Op.getOperand(0);
30341 EVT SrcVT = Val.getValueType();
30342 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30343 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30344
30345 if (VT.bitsGE(SrcVT)) {
30346 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30347
30348 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
30349 VT.changeTypeToInteger(), Val);
30350
30351 // Safe to use an operand larger than specified because promoting the
30352 // value changes nothing from an arithmetic point of view.
30353 Val =
30354 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
30355 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30356 DAG.getUNDEF(ContainerDstVT));
30357 return convertFromScalableVector(DAG, VT, Val);
30358 } else {
30359 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
30360 ContainerDstVT.getVectorElementType());
30361 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
30362
30363 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30364 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30365 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
30366 Val = convertFromScalableVector(DAG, SrcVT, Val);
30367
30368 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30369 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30370 }
30371}
30372
30373SDValue
30374AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
30375 SelectionDAG &DAG) const {
30376 SDLoc DL(Op);
30377 EVT OpVT = Op.getValueType();
30378 assert(OpVT.isScalableVector() &&
30379 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
30380
30381 // Are multi-register uzp instructions available?
30382 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30383 OpVT.getVectorElementType() != MVT::i1) {
30384 Intrinsic::ID IntID;
30385 switch (Op->getNumOperands()) {
30386 default:
30387 return SDValue();
30388 case 2:
30389 IntID = Intrinsic::aarch64_sve_uzp_x2;
30390 break;
30391 case 4:
30392 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30393 OpVT.getScalarSizeInBits() == 64)
30394 return SDValue();
30395 IntID = Intrinsic::aarch64_sve_uzp_x4;
30396 break;
30397 }
30398
30400 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30401 Ops.append(Op->op_values().begin(), Op->op_values().end());
30402 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30403 }
30404
30405 if (Op->getNumOperands() != 2)
30406 return SDValue();
30407
30408 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
30409 Op.getOperand(1));
30410 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
30411 Op.getOperand(1));
30412 return DAG.getMergeValues({Even, Odd}, DL);
30413}
30414
30415SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
30416 SelectionDAG &DAG) const {
30417 SDLoc DL(Op);
30418 EVT OpVT = Op.getValueType();
30419 assert(OpVT.isScalableVector() &&
30420 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30421
30422 // Are multi-register zip instructions available?
30423 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30424 OpVT.getVectorElementType() != MVT::i1) {
30425 Intrinsic::ID IntID;
30426 switch (Op->getNumOperands()) {
30427 default:
30428 return SDValue();
30429 case 2:
30430 IntID = Intrinsic::aarch64_sve_zip_x2;
30431 break;
30432 case 4:
30433 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30434 OpVT.getScalarSizeInBits() == 64)
30435 return SDValue();
30436 IntID = Intrinsic::aarch64_sve_zip_x4;
30437 break;
30438 }
30439
30441 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30442 Ops.append(Op->op_values().begin(), Op->op_values().end());
30443 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30444 }
30445
30446 if (Op->getNumOperands() != 2)
30447 return SDValue();
30448
30449 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30450 Op.getOperand(1));
30451 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30452 Op.getOperand(1));
30453 return DAG.getMergeValues({Lo, Hi}, DL);
30454}
30455
30456SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30457 SelectionDAG &DAG) const {
30458 // FIXME: Maybe share some code with LowerMGather/Scatter?
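  // The update is lowered as: gather the current bucket values, use HISTCNT to
  // count how many lanes target each bucket, multiply the counts by the
  // increment, add to the gathered values, and scatter the results back.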
30459 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30460 SDLoc DL(HG);
30461 SDValue Chain = HG->getChain();
30462 SDValue Inc = HG->getInc();
30463 SDValue Mask = HG->getMask();
30464 SDValue Ptr = HG->getBasePtr();
30465 SDValue Index = HG->getIndex();
30466 SDValue Scale = HG->getScale();
30467 SDValue IntID = HG->getIntID();
30468
30469 // The Intrinsic ID determines the type of update operation.
30470 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30471 // Right now, we only support 'add' as an update.
30472 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30473 "Unexpected histogram update operation");
30474
30475 EVT IndexVT = Index.getValueType();
30476 LLVMContext &Ctx = *DAG.getContext();
30477 ElementCount EC = IndexVT.getVectorElementCount();
30478 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30479 EVT IncExtVT =
30480 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30481 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30482 bool ExtTrunc = IncSplatVT != MemVT;
30483
30484 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30485 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30486 SDValue IncSplat = DAG.getSplatVector(
30487 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30488 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30489
30490 MachineMemOperand *MMO = HG->getMemOperand();
30491 // Create an MMO for the gather, without load|store flags.
30492 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30493 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
30494 MMO->getAlign(), MMO->getAAInfo());
30495 ISD::MemIndexType IndexType = HG->getIndexType();
30496 SDValue Gather = DAG.getMaskedGather(
30497 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30498 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30499
30500 SDValue GChain = Gather.getValue(1);
30501
30502 // Perform the histcnt, multiply by inc, add to bucket data.
30503 SDValue ID =
30504 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30505 SDValue HistCnt =
30506 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30507 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30508 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30509
30510 // Create an MMO for the scatter, without load|store flags.
30511 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30512 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
30513 MMO->getAlign(), MMO->getAAInfo());
30514
30515 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30516 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30517 ScatterOps, SMMO, IndexType, ExtTrunc);
30518 return Scatter;
30519}
30520
30521/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30522/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30523/// however still make use of the dot product instruction by instead
30524/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30525/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30526/// the following pattern is emitted:
30527/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N,
30528/// NTy/2))))
30529SDValue
30530AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30531 SelectionDAG &DAG) const {
30532 SDLoc DL(Op);
30533
30534 SDValue Acc = Op.getOperand(0);
30535 SDValue LHS = Op.getOperand(1);
30536 SDValue RHS = Op.getOperand(2);
30537 EVT ResultVT = Op.getValueType();
30538 EVT OrigResultVT = ResultVT;
30539 EVT OpVT = LHS.getValueType();
30540
30541 bool ConvertToScalable =
30542 ResultVT.isFixedLengthVector() &&
30543 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30544
30545 if (ConvertToScalable) {
30546 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30547 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30548 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30549 LHS = convertToScalableVector(DAG, OpVT, LHS);
30550 RHS = convertToScalableVector(DAG, OpVT, RHS);
30551 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30552 }
30553
30554 // Two-way and four-way partial reductions are supported by patterns.
30555 // We only need to handle the 8-way partial reduction.
30556 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30557 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30558 : Op;
30559
30560 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30561 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30562 DAG.getConstant(0, DL, DotVT), LHS, RHS);
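  // DotNode now holds the (nx)v4i32 dot-product partial reduction of the i8
  // inputs; it still has to be accumulated into the (nx)v2i64 result below.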
30563
30564 SDValue Res;
30565 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30566 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30567 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30568 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30569 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30570 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30571 } else {
30572 // Fold (nx)v4i32 into (nx)v2i64
30573 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30574 if (IsUnsigned) {
30575 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30576 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30577 } else {
30578 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30579 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30580 }
30581 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30582 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30583 }
30584
30585 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30586 : Res;
30587}
30588
30589SDValue
30590AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30591 SelectionDAG &DAG) const {
30592 EVT VT = Op.getValueType();
30593 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30594
30595 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30596 "Lowering fixed length get_active_lane_mask requires SVE!");
30597
30598 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30599 // but we can use SVE when available.
30600
30601 SDLoc DL(Op);
30602 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30603 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30604
30605 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30606 Op.getOperand(0), Op.getOperand(1));
30607 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30608 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30609 DAG.getVectorIdxConstant(0, DL));
30610}
30611
30612SDValue
30613AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30614 SelectionDAG &DAG) const {
30615 EVT VT = Op.getValueType();
30616 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30617
30618 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30619 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30620 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30621
30622 SDLoc DL(Op);
30623 SDValue Val = Op.getOperand(0);
30624 EVT SrcVT = Val.getValueType();
30625 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30626 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30627
30628 if (VT.bitsGT(SrcVT)) {
30629 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30630 ContainerSrcVT.getVectorElementType());
30632
30633 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30634 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30635
30636 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30637 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30638 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30639 DAG.getUNDEF(ContainerDstVT));
30640 return convertFromScalableVector(DAG, VT, Val);
30641 } else {
30642 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30644
30645 // Safe to use a result larger than specified since an fp_to_int where the
30646 // result doesn't fit into the destination is undefined.
30647 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30648 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30649 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30650
30651 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30652 }
30653}
30654
30655static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
30656 ArrayRef<int> ShuffleMask, EVT VT,
30657 EVT ContainerVT, SelectionDAG &DAG) {
30658 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30659 SDLoc DL(Op);
30660 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30661 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30662 bool IsSingleOp =
30663 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30664
30665 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30666 MinSVESize = 128;
30667
30668 // Bail out on two-operand shuffles if SVE2 is unavailable or not all index
30669 // values can be represented.
30670 if (!IsSingleOp && !Subtarget.hasSVE2())
30671 return SDValue();
30672
30673 EVT VTOp1 = Op.getOperand(0).getValueType();
30674 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30675 unsigned IndexLen = MinSVESize / BitsPerElt;
30676 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30677 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30678 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30679 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30680 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30681 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30682 "Incorrectly legalised shuffle operation");
30683
30684 SmallVector<SDValue, 8> TBLMask;
30685 // If MinSVESize is not equal to MaxSVESize then we need to know which
30686 // TBL mask element needs adjustment.
30687 SmallVector<SDValue, 8> AddRuntimeVLMask;
30688
30689 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30690 // size 8 bits is only sufficient to index into the first source vector.
30691 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30692 return SDValue();
30693
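  // e.g. with 256-bit SVE (IndexLen 8) and a fixed-length v4i32 two-operand
  // shuffle, a mask index of 5 (element 1 of the second operand) becomes
  // 5 + (8 - 4) = 9, i.e. element 1 of the second TBL source register.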
30694 for (int Index : ShuffleMask) {
30695 // Handle poison index values.
30696 if (Index < 0)
30697 Index = 0;
30698 // If the mask refers to elements in the second operand, then we have to
30699 // offset the index by the number of elements in a vector. If this number
30700 // is not known at compile-time, we need to maintain a mask with 'VL' values
30701 // to add at runtime.
30702 if ((unsigned)Index >= ElementsPerVectorReg) {
30703 if (MinMaxEqual) {
30704 Index += IndexLen - ElementsPerVectorReg;
30705 } else {
30706 Index = Index - ElementsPerVectorReg;
30707 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30708 }
30709 } else if (!MinMaxEqual)
30710 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30711 // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
30712 // 255, this might point to the last element in the second operand of the
30713 // shufflevector, thus we reject this transform.
30714 if ((unsigned)Index >= MaxOffset)
30715 return SDValue();
30716 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30717 }
30718
30719 // Choosing an out-of-range index leads to the lane being zeroed, whereas a
30720 // zero index would instead duplicate the first lane for these padding
30721 // elements. Note that for i8 elements an out-of-range index could still be
30722 // valid for a 2048-bit vector register size.
30723 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30724 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30725 if (!MinMaxEqual)
30726 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30727 }
30728
30729 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30730 SDValue VecMask =
30731 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30732 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30733
30734 SDValue Shuffle;
30735 if (IsSingleOp)
30736 Shuffle =
30737 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30738 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30739 Op1, SVEMask);
30740 else if (Subtarget.hasSVE2()) {
30741 if (!MinMaxEqual) {
30742 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30743 SDValue VScale = (BitsPerElt == 64)
30744 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30745 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30746 SDValue VecMask =
30747 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30748 SDValue MulByMask = DAG.getNode(
30749 ISD::MUL, DL, MaskType,
30750 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30751 DAG.getBuildVector(MaskType, DL,
30752 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30753 SDValue UpdatedVecMask =
30754 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30755 SVEMask = convertToScalableVector(
30756 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30757 }
30758 Shuffle =
30759 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30760 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30761 Op1, Op2, SVEMask);
30762 }
30763 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30764 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30765}
30766
30767SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30768 SDValue Op, SelectionDAG &DAG) const {
30769 EVT VT = Op.getValueType();
30770 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30771
30772 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
30773 auto ShuffleMask = SVN->getMask();
30774
30775 SDLoc DL(Op);
30776 SDValue Op1 = Op.getOperand(0);
30777 SDValue Op2 = Op.getOperand(1);
30778
30779 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30780 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
30781 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
30782
30783 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
30784 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
30785 return MVT::i32;
30786 return ScalarTy;
30787 };
30788
30789 if (SVN->isSplat()) {
30790 unsigned Lane = std::max(0, SVN->getSplatIndex());
30791 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30792 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30793 DAG.getConstant(Lane, DL, MVT::i64));
30794 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
30795 return convertFromScalableVector(DAG, VT, Op);
30796 }
30797
30798 bool ReverseEXT = false;
30799 unsigned Imm;
30800 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
30801 Imm == VT.getVectorNumElements() - 1) {
30802 if (ReverseEXT)
30803 std::swap(Op1, Op2);
30804 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30805 SDValue Scalar = DAG.getNode(
30806 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30807 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
30808 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
30809 return convertFromScalableVector(DAG, VT, Op);
30810 }
30811
30812 unsigned EltSize = VT.getScalarSizeInBits();
30813 for (unsigned BlockSize : {64U, 32U, 16U}) {
30814 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
30815 unsigned RevOp;
30816 if (EltSize == 8)
30817 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
30818 else if (EltSize == 16)
30819 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
30820 else
30821 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
30822 EVT BlockedVT =
30823 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
30824 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
30825 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
30826 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
30827 DAG.getUNDEF(BlockedVT));
30828 SDValue Container =
30829 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
30830 return convertFromScalableVector(DAG, VT, Container);
30831 }
30832 }
30833
30834 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
30835 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
30836 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30837 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
30838 Pg, Op1, DAG.getUNDEF(ContainerVT));
30839 return convertFromScalableVector(DAG, VT, Revd);
30840 }
30841
30842 unsigned WhichResult;
30843 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30844 WhichResult == 0)
30845 return convertFromScalableVector(
30846 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
30847
30848 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30849 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30850 return convertFromScalableVector(
30851 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30852 }
30853
30854 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
30855 return convertFromScalableVector(
30856 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
30857
30858 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30859 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30860 return convertFromScalableVector(
30861 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30862 }
30863
30864 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
30865 // represents the same logical operation as performed by a ZIP instruction. In
30866 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
30867 // equivalent to an AArch64 instruction. There's the extra component of
30868 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
30869 // only operated on 64/128-bit vector types that have a direct mapping to a
30870 // target register and so an exact mapping is implied.
30871 // However, when using SVE for fixed length vectors, most legal vector types
30872 // are actually sub-vectors of a larger SVE register. When mapping
30873 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
30874 // how the mask's indices translate. Specifically, when the mapping requires
30875 // an exact meaning for a specific vector index (e.g. Index X is the last
30876 // vector element in the register) then such mappings are often only safe when
30877 // the exact SVE register size is known. The main exception to this is when
30878 // indices are logically relative to the first element of either
30879 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
30880 // when converting from fixed-length to scalable vector types (i.e. the start
30881 // of a fixed length vector is always the start of a scalable vector).
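// [Editorial illustration, not part of the original source: assuming a
// fixed-length v4i32 held in a 256-bit SVE register, a ZIP1-style mask such
// as <0,4,1,5> only refers to the leading elements of each operand, so it is
// lowered above without knowing the register width. A ZIP2-style mask such
// as <2,6,3,7> means "the upper half of the 128-bit fixed-length value",
// which is not the upper half of the 256-bit register, so ZIP2/UZP/reverse
// masks are only handled inside the block below that is guarded by
// MinSVESize == MaxSVESize == VT.getSizeInBits().]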
30882 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30883 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30884 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30885 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
30886 Op2.isUndef()) {
30887 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
30888 return convertFromScalableVector(DAG, VT, Op);
30889 }
30890
30891 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30892 WhichResult != 0)
30893 return convertFromScalableVector(
30894 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
30895
30896 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30897 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30898 return convertFromScalableVector(
30899 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30900 }
30901
30902 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
30903 return convertFromScalableVector(
30904 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
30905
30906 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30907 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30908 return convertFromScalableVector(
30909 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30910 }
30911
30912 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30913 Subtarget->isSVEorStreamingSVEAvailable()) {
30915 "Unsupported SVE vector size");
30916
30917 unsigned Segments = VT.getSizeInBits() / 128;
30918 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30919 if (std::optional<unsigned> Lane =
30920 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
30921 SDValue IID =
30922 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
30923 return convertFromScalableVector(
30924 DAG, VT,
30925 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30926 {IID, Op1,
30927 DAG.getConstant(*Lane, DL, MVT::i64,
30928 /*isTarget=*/true)}));
30929 }
30930 }
30931 }
30932
30933 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
30934 // This may allow the shuffle to be matched as something cheaper like ZIP1.
30935 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
30936 return WideOp;
30937
30938 // Avoid producing a TBL instruction if we don't know the minimal SVE register
30939 // size, unless NEON is not available and we can assume the minimal SVE
30940 // register size is 128 bits.
30941 if (MinSVESize || !Subtarget->isNeonAvailable())
30942 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
30943 DAG);
30944
30945 return SDValue();
30946}
30947
30948SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
30949 SelectionDAG &DAG) const {
30950 SDLoc DL(Op);
30951 EVT InVT = Op.getValueType();
30952
30953 assert(VT.isScalableVector() && isTypeLegal(VT) &&
30954 InVT.isScalableVector() && isTypeLegal(InVT) &&
30955 "Only expect to cast between legal scalable vector types!");
30956 assert(VT.getVectorElementType() != MVT::i1 &&
30957 InVT.getVectorElementType() != MVT::i1 &&
30958 "For predicate bitcasts, use getSVEPredicateBitCast");
30959
30960 if (InVT == VT)
30961 return Op;
30962
30963 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
30964 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
30965
30966 // Safe bitcasting between unpacked vector types of different element counts
30967 // is currently unsupported because the following is missing the necessary
30968 // work to ensure the result's elements live where they're supposed to within
30969 // an SVE register.
30970 // 01234567
30971 // e.g. nxv2i32 = XX??XX??
30972 // nxv4f16 = X?X?X?X?
30973 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
30974 VT == PackedVT || InVT == PackedInVT) &&
30975 "Unexpected bitcast!");
30976
30977 // Pack input if required.
30978 if (InVT != PackedInVT)
30979 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
30980
30981 if (Subtarget->isLittleEndian() ||
30982 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
30983 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30984 else {
30985 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
30986 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
30987
30988 // Simulate the effect of casting through memory.
30989 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
30990 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
30991 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
30992 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
30993 if (PackedVTAsInt.getScalarSizeInBits() != 8)
30994 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
30995 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30996 }
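// [Editorial worked example, not part of the original source: assuming a
// big-endian target casting nxv2f64 to nxv4f32, the else-branch above emits
//   BITCAST -> nxv2i64  (PackedInVT as integer)
//   BSWAP   -> nxv2i64  (byte-swap the 8-byte lanes)
//   NVCAST  -> nxv4i32  (reinterpret with the new element count)
//   BSWAP   -> nxv4i32  (byte-swap the 4-byte lanes)
//   BITCAST -> nxv4f32  (PackedVT)
// which mimics a store/load round trip through memory.]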
30997
30998 // Unpack result if required.
30999 if (VT != PackedVT)
31000 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
31001
31002 return Op;
31003}
31004
31005bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
31006 SDValue N) const {
31007 return ::isAllActivePredicate(DAG, N);
31008}
31009
31010EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
31011 return ::getPromotedVTForPredicate(VT);
31012}
31013
31014bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
31015 SDValue Op, const APInt &OriginalDemandedBits,
31016 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
31017 unsigned Depth) const {
31018
31019 unsigned Opc = Op.getOpcode();
31020 switch (Opc) {
31021 case AArch64ISD::VSHL: {
31022 // Match (VSHL (VLSHR Val X) X)
31023 SDValue ShiftL = Op;
31024 SDValue ShiftR = Op->getOperand(0);
31025 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
31026 return false;
31027
31028 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
31029 return false;
31030
31031 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
31032 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
31033
31034 // Other cases can be handled as well, but this is not
31035 // implemented.
31036 if (ShiftRBits != ShiftLBits)
31037 return false;
31038
31039 unsigned ScalarSize = Op.getScalarValueSizeInBits();
31040 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
31041
31042 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
31043 APInt UnusedBits = ~OriginalDemandedBits;
31044
31045 if ((ZeroBits & UnusedBits) != ZeroBits)
31046 return false;
31047
31048 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
31049 // used - simplify to just Val.
31050 return TLO.CombineTo(Op, ShiftR->getOperand(0));
31051 }
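// [Editorial worked example, not part of the original source: for a v8i16
// value, (VSHL (VLSHR x, 8), 8) clears the low 8 bits of every lane. If the
// caller only demands bits 8-15 (OriginalDemandedBits = 0xff00), those
// cleared bits are all unused, so the case above folds the shift pair away
// and uses x directly.]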
31052 case AArch64ISD::BICi: {
31053 // Fold BICi if all destination bits are already known to be zeroed
31054 SDValue Op0 = Op.getOperand(0);
31055 KnownBits KnownOp0 =
31056 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
31057 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
31058 APInt BitsToClear =
31059 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
31060 .trunc(KnownOp0.getBitWidth());
31061 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
31062 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
31063 return TLO.CombineTo(Op, Op0);
31064
31065 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
31066 return false;
31067 }
31068 case ISD::INTRINSIC_WO_CHAIN: {
31069 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
31070 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31071 if (!MaxSVEVectorSizeInBits)
31072 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31073 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
31074 // The SVE count intrinsics don't support the multiplier immediate so we
31075 // don't have to account for that here. The value returned may be slightly
31076 // over the true required bits, as this is based on the "ALL" pattern. The
31077 // other patterns are also exposed by these intrinsics, but they all
31078 // return a value that's strictly less than "ALL".
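// [Editorial worked example, not part of the original source: assuming the
// subtarget reports no maximum SVE vector size, the fallback above is
// AArch64::SVEMaxBitsPerVector (2048 bits). For a 32-bit element count
// intrinsic, MaxElements = 2048 / 32 = 64 and bit_width(64) = 7, so for an
// i64 result the top 57 bits are marked known-zero below.]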
31079 unsigned RequiredBits = llvm::bit_width(MaxElements);
31080 unsigned BitWidth = Known.Zero.getBitWidth();
31081 if (RequiredBits < BitWidth)
31082 Known.Zero.setHighBits(BitWidth - RequiredBits);
31083 return false;
31084 }
31085 }
31086 }
31087
31088 return TargetLowering::SimplifyDemandedBitsForTargetNode(
31089 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
31090}
31091
31092bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
31093 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
31094 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
31095
31096 // TODO: Add more target nodes.
31097 switch (Op.getOpcode()) {
31098 case AArch64ISD::MOVI:
31099 case AArch64ISD::MOVIedit:
31100 case AArch64ISD::MOVImsl:
31101 case AArch64ISD::MOVIshift:
31102 case AArch64ISD::MVNImsl:
31103 case AArch64ISD::MVNIshift:
31104 case AArch64ISD::VASHR:
31105 case AArch64ISD::VLSHR:
31106 case AArch64ISD::VSHL:
31107 return false;
31108 }
31109 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
31110 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
31111}
31112
31113bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
31114 return Op.getOpcode() == AArch64ISD::DUP ||
31115 Op.getOpcode() == AArch64ISD::MOVI ||
31116 Op.getOpcode() == AArch64ISD::MOVIshift ||
31117 Op.getOpcode() == AArch64ISD::MOVImsl ||
31118 Op.getOpcode() == AArch64ISD::MOVIedit ||
31119 Op.getOpcode() == AArch64ISD::MVNIshift ||
31120 Op.getOpcode() == AArch64ISD::MVNImsl ||
31121 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
31122 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
31123 // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
31124 (Op.getOpcode() == ISD::FNEG &&
31125 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
31126 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
31127 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
31128 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
31129 TargetLowering::isTargetCanonicalConstantNode(Op);
31130}
31131
31132bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
31133 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
31134 Subtarget->hasComplxNum();
31135}
31136
31137bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
31138 ComplexDeinterleavingOperation Operation, Type *Ty) const {
31139 auto *VTy = dyn_cast<VectorType>(Ty);
31140 if (!VTy)
31141 return false;
31142
31143 // If the vector is scalable, SVE is enabled, implying support for complex
31144 // numbers. Otherwise, we need to ensure complex number support is available
31145 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
31146 return false;
31147
31148 auto *ScalarTy = VTy->getScalarType();
31149 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
31150
31151 // We can only process vectors that have a bit size of 128 or higher (with an
31152 // additional 64 bits for Neon). Additionally, these vectors must have a
31153 // power-of-2 size, as we later split them into the smallest supported size
31154 // and merge them back together after applying the complex operation.
31155 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
31156 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
31157 !llvm::isPowerOf2_32(VTyWidth))
31158 return false;
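// [Editorial worked example, not part of the original source: v4f32 gives
// VTyWidth = 4 * 32 = 128 and passes the check above; v2f32 (64 bits) passes
// only because it is a fixed-length Neon-sized vector; <vscale x 2 x float>
// (minimum 64 bits) is rejected because it is scalable, and v3f32 (96 bits)
// is rejected as neither 64 bits nor a power of two of at least 128.]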
31159
31160 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
31161 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
31162
31163 if (Operation == ComplexDeinterleavingOperation::CDot)
31164 return ScalarWidth == 32 || ScalarWidth == 64;
31165 return 8 <= ScalarWidth && ScalarWidth <= 64;
31166 }
31167
31168 // CDot is not supported outside of scalable/sve scopes
31169 if (Operation == ComplexDeinterleavingOperation::CDot)
31170 return false;
31171
31172 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
31173 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
31174}
31175
31176Value *AArch64TargetLowering::createComplexDeinterleavingIR(
31177 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
31178 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
31179 Value *Accumulator) const {
31180 VectorType *Ty = cast<VectorType>(InputA->getType());
31181 if (Accumulator == nullptr)
31182 Accumulator = Constant::getNullValue(Ty);
31183 bool IsScalable = Ty->isScalableTy();
31184 bool IsInt = Ty->getElementType()->isIntegerTy();
31185
31186 unsigned TyWidth =
31187 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
31188
31189 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
31190 "Vector type must be either 64 or a power of 2 that is at least 128");
31191
31192 if (TyWidth > 128) {
31193 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
31194 int AccStride = cast<VectorType>(Accumulator->getType())
31195 ->getElementCount()
31196 .getKnownMinValue() /
31197 2;
31198 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
31199 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
31200 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
31201 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
31202 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
31203 Value *LowerSplitAcc = nullptr;
31204 Value *UpperSplitAcc = nullptr;
31205 Type *FullTy = Ty;
31206 FullTy = Accumulator->getType();
31207 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
31208 cast<VectorType>(Accumulator->getType()));
31209 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
31210 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
31211 auto *LowerSplitInt = createComplexDeinterleavingIR(
31212 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
31213 auto *UpperSplitInt = createComplexDeinterleavingIR(
31214 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
31215
31216 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
31217 LowerSplitInt, uint64_t(0));
31218 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
31219 }
31220
31221 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
31222 if (IsScalable) {
31223 if (IsInt)
31224 return B.CreateIntrinsic(
31225 Intrinsic::aarch64_sve_cmla_x, Ty,
31226 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31227
31228 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31229 return B.CreateIntrinsic(
31230 Intrinsic::aarch64_sve_fcmla, Ty,
31231 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31232 }
31233
31234 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
31235 Intrinsic::aarch64_neon_vcmla_rot90,
31236 Intrinsic::aarch64_neon_vcmla_rot180,
31237 Intrinsic::aarch64_neon_vcmla_rot270};
31238
31239
31240 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
31241 {Accumulator, InputA, InputB});
31242 }
31243
31244 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
31245 if (IsScalable) {
31246 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
31247 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
31248 if (IsInt)
31249 return B.CreateIntrinsic(
31250 Intrinsic::aarch64_sve_cadd_x, Ty,
31251 {InputA, InputB, B.getInt32((int)Rotation * 90)});
31252
31253 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31254 return B.CreateIntrinsic(
31255 Intrinsic::aarch64_sve_fcadd, Ty,
31256 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
31257 }
31258 return nullptr;
31259 }
31260
31261 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
31262 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
31263 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
31264 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
31265 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
31266
31267 if (IntId == Intrinsic::not_intrinsic)
31268 return nullptr;
31269
31270 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
31271 }
31272
31273 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
31274 IsScalable) {
31275 return B.CreateIntrinsic(
31276 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
31277 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31278 }
31279
31280 return nullptr;
31281}
31282
31283bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
31284 unsigned Opc = N->getOpcode();
31285 if (ISD::isExtOpcode(Opc)) {
31286 if (any_of(N->users(),
31287 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
31288 return false;
31289 }
31290 return true;
31291}
31292
31293unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
31294 return Subtarget->getMinimumJumpTableEntries();
31295}
31296
31297MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
31298 CallingConv::ID CC,
31299 EVT VT) const {
31300 bool NonUnitFixedLengthVector =
31301 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
31302 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31303 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
31304
31305 EVT VT1;
31306 MVT RegisterVT;
31307 unsigned NumIntermediates;
31308 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
31309 RegisterVT);
31310 return RegisterVT;
31311}
31312
31313unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
31314 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
31315 bool NonUnitFixedLengthVector =
31316 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
31317 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31318 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
31319
31320 EVT VT1;
31321 MVT VT2;
31322 unsigned NumIntermediates;
31323 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
31324 NumIntermediates, VT2);
31325}
31326
31327unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
31328 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
31329 unsigned &NumIntermediates, MVT &RegisterVT) const {
31330 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
31331 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
31332 if (!RegisterVT.isFixedLengthVector() ||
31333 RegisterVT.getFixedSizeInBits() <= 128)
31334 return NumRegs;
31335
31336 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
31337 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
31338 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
31339
31340 // A size mismatch here implies either type promotion or widening and would
31341 // have resulted in scalarisation if larger vectors had not been available.
31342 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
31343 EVT EltTy = VT.getVectorElementType();
31344 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
31345 if (!isTypeLegal(NewVT))
31346 NewVT = EltTy;
31347
31348 IntermediateVT = NewVT;
31349 NumIntermediates = VT.getVectorNumElements();
31350 RegisterVT = getRegisterType(Context, NewVT);
31351 return NumIntermediates;
31352 }
31353
31354 // SVE VLS support does not introduce a new ABI so we should use NEON sized
31355 // types for vector arguments and returns.
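// [Editorial worked example, not part of the original source: assuming SVE
// VLS with 256-bit vectors, a fixed-length v8i32 argument reaches this point
// with RegisterVT = v8i32 (256 bits), so NumSubRegs = 256 / 128 = 2, the
// register counts double, and the value is passed as two v4i32 registers to
// keep the Neon-based ABI.]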
31356
31357 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
31358 NumIntermediates *= NumSubRegs;
31359 NumRegs *= NumSubRegs;
31360
31361 switch (RegisterVT.getVectorElementType().SimpleTy) {
31362 default:
31363 llvm_unreachable("unexpected element type for vector");
31364 case MVT::i8:
31365 IntermediateVT = RegisterVT = MVT::v16i8;
31366 break;
31367 case MVT::i16:
31368 IntermediateVT = RegisterVT = MVT::v8i16;
31369 break;
31370 case MVT::i32:
31371 IntermediateVT = RegisterVT = MVT::v4i32;
31372 break;
31373 case MVT::i64:
31374 IntermediateVT = RegisterVT = MVT::v2i64;
31375 break;
31376 case MVT::f16:
31377 IntermediateVT = RegisterVT = MVT::v8f16;
31378 break;
31379 case MVT::f32:
31380 IntermediateVT = RegisterVT = MVT::v4f32;
31381 break;
31382 case MVT::f64:
31383 IntermediateVT = RegisterVT = MVT::v2f64;
31384 break;
31385 case MVT::bf16:
31386 IntermediateVT = RegisterVT = MVT::v8bf16;
31387 break;
31388 }
31389
31390 return NumRegs;
31391}
31392
31393bool AArch64TargetLowering::hasInlineStackProbe(
31394 const MachineFunction &MF) const {
31395 return !Subtarget->isTargetWindows() &&
31396 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
31397}
31398
31400 switch (Opc) {
31404 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
31405 return true;
31406 }
31407
31409}
31410
31411bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
31412 EVT VT) const {
31413 return Subtarget->hasCPA() && UseFEATCPACodegen;
31414}
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override
Return true if the @llvm.vector.partial.reduce.* intrinsic should be expanded.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
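The overrides above are consumed through the generic TargetLowering interface. As a minimal, hedged sketch (not code from this file; the helper name is invented), target-independent code typically queries a couple of these hooks like this:

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical helper: ask the target whether both immediates can be encoded
// directly, using two of the hooks documented above.
static bool canFoldImmediates(const TargetLowering &TLI, int64_t AddImm,
                              int64_t CmpImm) {
  return TLI.isLegalAddImmediate(AddImm) && TLI.isLegalICmpImmediate(CmpImm);
}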
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
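A short standalone sketch of how a few of the APInt helpers listed above compose; the function is purely illustrative and not part of this file:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Hypothetical helper: report a few properties of a 32-bit immediate that
// lowering code commonly queries before picking an instruction form.
static void describeImm(uint64_t V) {
  APInt Imm(32, V);                        // 32-bit constant
  APInt Wide = Imm.zext(64);               // zero-extend to 64 bits
  bool Pow2 = Imm.isPowerOf2();            // exactly one bit set
  bool LowMask =
      Imm.popcount() && Imm.isMask(Imm.popcount()); // contiguous low-bit mask
  unsigned TrailingZeros = Imm.countr_zero();
  (void)Wide; (void)Pow2; (void)LowMask; (void)TrailingZeros;
}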
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v); minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ And
*p = old & v
@ FMaximum
*p = maximum(old, v); maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
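The enumerators above spell out the read-modify-write semantics. As a hedged sketch (not from this file; the predicate name is invented), backend code commonly inspects an AtomicRMWInst like this before deciding whether to expand it:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical predicate: floating-point RMW operations (fadd, fmin, ...)
// and nand have no single-instruction atomic form on many targets, so a
// backend might choose to expand them (e.g. to an LL/SC or cmpxchg loop).
static bool needsExpansion(const AtomicRMWInst *RMW) {
  if (RMW->isFloatingPointOperation())
    return true;
  return RMW->getOperation() == AtomicRMWInst::Nand;
}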
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
LLVM_ABI bool isConstant() const
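A minimal sketch, not taken from this file, of the usual splat-detection pattern built on isConstantSplat; the helper name is invented and element-size checks are omitted for brevity:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical helper: does this BUILD_VECTOR splat an all-ones constant?
static bool isAllOnesSplat(SDValue V) {
  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
  if (!BV)
    return false;
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
         SplatValue.isAllOnes();
}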
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
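For illustration only (the function name is invented), the conventional CCState pattern for checking whether return values fit a calling convention looks roughly like this:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetCallingConv.h"
using namespace llvm;

// Hypothetical CanLowerReturn-style check: can these return values be
// assigned under the given convention?
static bool canLowerReturnSketch(CallingConv::ID CC, bool IsVarArg,
                                 MachineFunction &MF,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 LLVMContext &Ctx, CCAssignFn *RetCC) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CC, IsVarArg, MF, RVLocs, Ctx);
  return CCInfo.CheckReturn(Outs, RetCC);
}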
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:198
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1936
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2251
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
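A small, self-contained sketch of the IRBuilder calls listed above; the callee name "consume" and the helper itself are invented for illustration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical helper: compute &Ptr[8] as an i8 GEP and pass it to a
// declared-on-demand function "void @consume(ptr)".
static Value *emitByteGEPAndCall(IRBuilder<> &B, Module &M, Value *Ptr) {
  Value *Off = B.CreateConstGEP1_32(B.getInt8Ty(), Ptr, 8);
  FunctionCallee Fn =
      M.getOrInsertFunction("consume", B.getVoidTy(), B.getPtrTy());
  B.CreateCall(Fn, {Off});
  return Off;
}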
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
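A standalone sketch of the MVT factory and query methods above; the values noted in comments are what these calls would report:

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Illustrative only: build a fixed and a scalable vector type and query them.
static void mvtQueries() {
  MVT V4i32 = MVT::getVectorVT(MVT::i32, 4);            // <4 x i32>
  MVT NxV2i64 = MVT::getScalableVectorVT(MVT::i64, 2);  // <vscale x 2 x i64>
  bool Is128 = V4i32.is128BitVector();                  // true
  bool Scalable = NxV2i64.isScalableVector();           // true
  unsigned EltBits = V4i32.getScalarSizeInBits();       // 32
  (void)Is128; (void)Scalable; (void)EltBits;
}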
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
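Illustrative only (the helper is invented): the typical shape of spill-slot and fixed-object creation against the MachineFrameInfo interface documented above:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Hypothetical helper: create an 8-byte spill slot and an 8-byte incoming
// argument object at SP+16, returning the spill slot's frame index.
static int createSpillAndIncomingArg(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int SpillFI = MFI.CreateSpillStackObject(8, Align(8));
  int ArgFI = MFI.CreateFixedObject(8, /*SPOffset=*/16, /*IsImmutable=*/true);
  (void)ArgFI;
  return SpillFI;
}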
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
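A hedged sketch of the MachineInstrBuilder chaining style above, using the target-independent COPY opcode so no AArch64 opcode tables are assumed; the helper name is invented:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

// Hypothetical helper: emit "Dst = COPY Src" before MBBI and return it.
static MachineInstr *emitCopy(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              const DebugLoc &DL, const TargetInstrInfo *TII,
                              Register Dst, Register Src) {
  return BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), Dst)
      .addReg(Src)
      .getInstr();
}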
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
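As a small illustration (the helper is invented), flags from this enumeration are combined bitwise when a MachineMemOperand is created:

#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Hypothetical helper: flags for a dereferenceable load, optionally
// marked non-temporal.
static MachineMemOperand::Flags loadFlags(bool NonTemporal) {
  MachineMemOperand::Flags F =
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
  if (NonTemporal)
    F |= MachineMemOperand::MONonTemporal;
  return F;
}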
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:712
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
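A hedged sketch, not from this file, of the SDValue/SDNode inspection idiom that DAG combines rely on; the matcher name is invented:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical matcher: recognise a single-use (add Base, Constant) and
// return its pieces.
static bool matchAddWithConstant(SDValue V, SDValue &Base, uint64_t &Imm) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    Base = V.getOperand(0);
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}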
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
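A minimal usage sketch of this helper (not code from this file); the helper name buildIsZero and the DAG/DL/LHS values it takes are assumed to come from a surrounding DAG combine.
// Hypothetical helper: build (setcc LHS, 0, eq) using the target's SetCC result type.
static SDValue buildIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue LHS) {
  EVT VT = LHS.getValueType();
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  return DAG.getSetCC(DL, CCVT, LHS, DAG.getConstant(0, DL, VT), ISD::SETEQ);
}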
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
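For illustration only, a sketch combining getNOT with getNode to form an (and X, (not Y)) pattern; the function name and its arguments are assumptions, not part of this file.
// Sketch: (and X, (xor Y, all-ones)) built via the getNOT convenience wrapper.
static SDValue buildAndNot(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                           SDValue Y) {
  EVT VT = X.getValueType();
  SDValue NotY = DAG.getNOT(DL, Y, VT); // expands to (XOR Y, -1)
  return DAG.getNode(ISD::AND, DL, VT, X, NotY);
}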
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
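A short, assumed-context sketch of the SelectCC helper: a signed maximum built from a compare-and-select, with the operands presumed to be integer SDValues provided by the caller.
// Sketch: smax(A, B) expressed as selectcc(A, B, A, B, setgt).
static SDValue buildSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                         SDValue B) {
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);
}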
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
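A hedged sketch of the splat helper: splatting a scalar constant across a fixed-width vector (the chosen types are illustrative).
// Sketch: build <1, 1, 1, 1> as a v4i32 splat BUILD_VECTOR.
static SDValue buildOnesSplat(SelectionDAG &DAG, const SDLoc &DL) {
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  return DAG.getSplatBuildVector(MVT::v4i32, DL, One);
}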
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
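A small sketch, assuming Op is a scalar integer SDValue, of using MaskedValueIsZero to prove that the low bits of a value are zero (e.g. when reasoning about alignment).
// Sketch: true when the low 4 bits of Op are known to be zero.
static bool lowNibbleKnownZero(SelectionDAG &DAG, SDValue Op) {
  APInt LowBits = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(), 4);
  return DAG.MaskedValueIsZero(Op, LowBits);
}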
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
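A brief usage sketch of the SmallSet insert/count API above; the helper name and the lane-deduplication scenario are illustrative.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallSet.h"
// Sketch: detect a repeated lane index using insert's 'was it new?' result.
static bool hasDuplicateLane(llvm::ArrayRef<int> Lanes) {
  llvm::SmallSet<int, 8> Seen;
  for (int L : Lanes)
    if (!Seen.insert(L).second) // insert returns {iterator, was-inserted}
      return true;              // count(L) would now report 1 for this lane
  return false;
}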
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:480
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:581
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:269
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:619
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:694
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
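A hedged usage sketch of StringSwitch; the names and mapping are illustrative, and Default is the usual terminator of the Case chain even though it is not listed above.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
// Sketch: map a feature-like string to a small integer code, 0 if unknown.
static unsigned classifyName(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("sve", 1)
      .Case("sve2", 2)
      .Default(0);
}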
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
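A sketch of how this hook is typically used from a target's TargetLowering constructor; the opcode/type/action combinations below are illustrative examples, not a description of what this target actually configures.
// Inside a hypothetical target constructor:
setOperationAction(ISD::SDIV, MVT::v2i64, Expand);  // no native op: expand/scalarize
setOperationAction(ISD::CTPOP, MVT::i32, Custom);   // route through LowerOperation
setOperationAction(ISD::FREM, MVT::f32, LibCall);   // lower to a runtime library call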
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
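A companion sketch for the load-extension and truncating-store hooks, again written as lines inside a hypothetical target constructor with illustrative types.
// Inside a hypothetical target constructor:
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); // extending load is native
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);            // no truncating store: expand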
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:299
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:283
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
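A hedged sketch of the check-then-encode idiom for AArch64 logical immediates; the constant is merely one example of a repeating bit pattern that the encoder accepts.
// Sketch: only call the encoder after isLogicalImmediate has accepted the value.
static void tryEncodeLogicalImm() {
  uint64_t Imm = 0x00ff00ff00ff00ffULL;            // repeating 16-bit pattern
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
    (void)Enc; // N:immr:imms field consumed by AND/ORR/EOR (immediate) forms
  }
}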
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ LOOP_DEPENDENCE_RAW_MASK
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:855
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:627
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:648
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:611
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
@ LOOP_DEPENDENCE_WAR_MASK
Set rounding mode.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
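An IR-level sketch of the PatternMatch helpers listed here, assuming a Value *V supplied by surrounding code: match a uitofp and capture its integer source.
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Sketch: if V is 'uitofp X', return X; otherwise return nullptr.
static Value *matchUIToFPSource(Value *V) {
  Value *X = nullptr;
  if (match(V, m_UIToFP(m_Value(X))))
    return X;
  return nullptr;
}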
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
const unsigned VectorBits
Definition SystemZ.h:154
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
bool isPackedVectorType(EVT SomeVT)
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:216
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
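A minimal sketch combining the bit-manipulation helpers listed above (isShiftedMask_64, countr_zero, Log2_64); the helper isRunOfOnes and its output parameters are illustrative only:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;
// Decompose an immediate that is a contiguous run of ones into (start bit, length).
static bool isRunOfOnes(uint64_t Imm, unsigned &Start, unsigned &Len) {
  if (!isShiftedMask_64(Imm))       // non-empty contiguous run of ones?
    return false;
  Start = countr_zero(Imm);         // index of the lowest set bit
  Len = Log2_64(Imm) - Start + 1;   // highest set bit - lowest set bit + 1
  return true;
}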
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:270
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
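A minimal sketch showing how the AArch64 shuffle-mask predicates above (isZIPMask, isUZPMask, isTRNMask) are typically queried; it assumes the AArch64 backend declarations are in scope, and classifyMask is an illustrative name:
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
static const char *classifyMask(ArrayRef<int> M, unsigned NumElts) {
  unsigned WhichResult; // 0 selects the "1" form, 1 selects the "2" form
  if (isZIPMask(M, NumElts, WhichResult))
    return WhichResult ? "zip2" : "zip1";
  if (isUZPMask(M, NumElts, WhichResult))
    return WhichResult ? "uzp2" : "uzp1";
  if (isTRNMask(M, NumElts, WhichResult))
    return WhichResult ? "trn2" : "trn1";
  return "other";
}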
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2100
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
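A minimal sketch of how these two SVE predicate-pattern helpers compose, assuming the AArch64 backend declarations are in scope (roundTripVL is an illustrative name):
#include <optional>
static unsigned roundTripVL(unsigned MinNumElts) {
  if (std::optional<unsigned> Pat = getSVEPredPatternFromNumElements(MinNumElts))
    return getNumElementsFromSVEPredPattern(*Pat); // recovers MinNumElts for the VL1..VL256 patterns
  return 0; // no exact VL predicate pattern for this element count
}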
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2088
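A minimal sketch applying the range helpers listed above (all_of, is_contained) to a shuffle mask; the function names are illustrative only:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;
// True if every non-poison lane (poison is encoded as -1) reads from the low half.
static bool usesOnlyLowHalf(ArrayRef<int> Mask, unsigned NumElts) {
  return all_of(Mask, [&](int E) {
    return E < 0 || static_cast<unsigned>(E) < NumElts / 2;
  });
}
static bool referencesLane(ArrayRef<int> Mask, int Lane) {
  return is_contained(Mask, Lane);
}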
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:207
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
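A minimal sketch exercising several of the EVT accessors above; Ctx is an LLVMContext supplied by the caller and the chosen types are illustrative only:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;
static EVT asNarrowIntegerVector(LLVMContext &Ctx) {
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);        // v4f32
  assert(V4F32.is128BitVector() && V4F32.isFloatingPoint());
  EVT V4I32 = V4F32.changeVectorElementTypeToInteger();  // v4i32, same lane count
  assert(V4I32.getScalarSizeInBits() == 32);
  return V4I32.getHalfNumVectorElementsVT(Ctx);           // v2i32
}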
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:294
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:154
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:304
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:128
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
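A minimal sketch combining the KnownBits operations above; the constants are chosen purely for illustration:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
static unsigned maxBitsAfterShift() {
  KnownBits LHS = KnownBits::makeConstant(APInt(32, 0xF0)); // fully known value 0xF0
  KnownBits Amt = KnownBits::makeConstant(APInt(32, 4));    // fully known shift amount
  KnownBits Res = KnownBits::lshr(LHS, Amt);                // known to be exactly 0x0F
  return Res.countMaxActiveBits();                          // 4
}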
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
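A minimal sketch of the MachinePointerInfo factories above, assuming a MachineFunction MF and frame index FI provided by the surrounding lowering code (secondWordOfSlot is an illustrative name):
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
static MachinePointerInfo secondWordOfSlot(MachineFunction &MF, int FI) {
  MachinePointerInfo Base = MachinePointerInfo::getFixedStack(MF, FI);
  return Base.getWithOffset(8); // same frame slot, 8 bytes further in
}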
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64